Commit 8f7c535
Parent(s): 44e368b
Update helper.py
helper.py CHANGED
@@ -65,9 +65,6 @@ def find_comptives_straight_patterns(sentence):
         if next_token.text.lower() == "than":
             prev_token = token.nbor(-1)
 
-            # this part checks what comes before more/less. We could make a NOUN mandatory (e.g. magnitude) or even specifically the word "magnitude"
-            # for the moment we have disabled it
-
             if token.text.lower() == 'more':
                 comparatives.append({'comparative': [token.text+" "+next_token.text, '>']})
             elif token.text.lower() == 'less':
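The comment removed above described an optional gate on the token preceding "more"/"less". For reference, a minimal self-contained sketch of the pattern with that gate reinstated; the helper name more_less_than and the require_noun flag are illustrative, and it assumes spaCy with the en_core_web_sm model installed:

import spacy

nlp = spacy.load("en_core_web_sm")

def more_less_than(sentence, require_noun=False):
    doc = nlp(sentence)
    results = []
    # skip the first and last token so nbor(-1)/nbor(1) stay in bounds
    for token in doc[1:-1]:
        next_token = token.nbor(1)
        if next_token.text.lower() == "than" and token.text.lower() in ("more", "less"):
            prev_token = token.nbor(-1)
            # the disabled check: only accept the pattern when a NOUN precedes more/less
            if require_noun and prev_token.pos_ != "NOUN":
                continue
            symbol = '>' if token.text.lower() == "more" else '<'
            results.append({'comparative': [token.text + " " + next_token.text, symbol]})
    return results

print(more_less_than("This quake was two magnitudes more than the last one", require_noun=True))
# [{'comparative': ['more than', '>']}]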
@@ -283,6 +280,7 @@ def identify_bigger_smaller_advanced(sentence):
     return bigger_list + smaller_list
 
 
+
 def find_equal_to_comptives_ngrams(sentence):
     """
     This function takes a sentence as input and returns a reference phrase based on semantic similarity using n-grams.
@@ -347,23 +345,41 @@ def single_verb_comptives(sentence):
 
     # search for all verbs and examine their lemmas against all the synonyms of each of the previous references. Assign a label accordingly
     for token in doc:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # first check for exact 1-1 text matches and 1-1 lemma matches
+        if token.text in bigger_references_sg or token.lemma_ in bigger_references_sg:
+            bigger_list.append({'comparative': [token.text, ">"]})
+            break
+
+        elif token.text in lesser_references_sg or token.lemma_ in lesser_references_sg:
+            smaller_list.append({'comparative': [token.text, "<"]})
+            break
+
+        elif token.text in equal_references_sg or token.lemma_ in equal_references_sg:
+            equal_list.append({'comparative': [token.text, "="]})
+            break
+
+        else:
+
+            # if not, then try with synonyms, for verbs only
+            if token.pos_ == "VERB":
+
+                for lemma in token.lemma_.split('|'):
+                    synsets = wordnet.synsets(lemma, pos='v')
+
+                    for syn in synsets:
+                        if any(lemma in bigger_references_sg for lemma in syn.lemma_names()):
+                            bigger_list.append({'comparative': [token.text, ">"]})
+                            break
+
+                        elif any(lemma in lesser_references_sg for lemma in syn.lemma_names()):
+                            smaller_list.append({'comparative': [token.text, "<"]})
+                            break
+
+                        elif any(lemma in equal_references_sg for lemma in syn.lemma_names()):
+                            equal_list.append({'comparative': [token.text, "="]})
+                            break
+
 
     final_list = bigger_list + smaller_list + equal_list
 
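For context on the synonym branch added above: a self-contained sketch of the WordNet lookup it relies on, assuming NLTK with the wordnet corpus downloaded; the toy reference set stands in for the bigger_references_sg list defined elsewhere in helper.py. Note that the committed code reuses the name lemma inside the any(...) generator, shadowing the outer loop variable; the sketch uses a distinct name for clarity.

from nltk.corpus import wordnet  # requires: nltk.download('wordnet')

bigger_references_sg = {"exceed", "surpass"}  # illustrative stand-in

def verb_matches_references(lemma, references):
    # gather every lemma name across the verb's synsets and test set membership
    for syn in wordnet.synsets(lemma, pos='v'):
        if any(name in references for name in syn.lemma_names()):
            return True
    return False

print(verb_matches_references("outstrip", bigger_references_sg))
# True -- "outstrip" shares a synset with "surpass"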
@@ -406,6 +422,7 @@ def cosine_sim(a, b):
     return cosine_similarity(a.reshape(1,-1), b.reshape(1,-1))[0][0]
 
 
+
 # we examine the n-grams in reverse order (largest first) and any time we find a match, we "delete" that match, so that smaller n-grams will not be matched
 # (e.g. "is on a par with" would otherwise also match "on a par with", "par with", etc.)
 
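The cosine_sim context line above is the entire helper; for reference, a runnable usage sketch (the vectors are illustrative):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(a, b):
    # reshape the 1-D vectors to (1, n) row matrices, since cosine_similarity expects 2-D input
    return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]

print(cosine_sim(np.array([1.0, 0.0]), np.array([1.0, 1.0])))
# ~0.7071 -- the cosine of the 45-degree angle between the two vectors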
@@ -433,7 +450,7 @@ def multiword_verb_comptives(sentence):
     matched_ngrams = set()
 
     # Iterate through n-grams of the sentence, starting with the largest n-grams
-    for n in range(5,
+    for n in range(5, 0, -1):
         for i in range(len(tokens)-n+1):
             ngram = ' '.join(tokens[i:i+n])
 
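A minimal sketch of this largest-first scan with the matched-n-gram suppression described earlier; the reference phrase set and helper name are illustrative:

reference_phrases = {"is on a par with", "on a par with", "par with"}

def match_ngrams_largest_first(sentence):
    tokens = sentence.split()
    matched_ngrams = set()
    matches = []
    for n in range(5, 0, -1):  # largest n-grams first
        for i in range(len(tokens) - n + 1):
            ngram = ' '.join(tokens[i:i+n])
            # skip n-grams already contained in a matched longer phrase
            if ngram in reference_phrases and not any(ngram in m for m in matched_ngrams):
                matched_ngrams.add(ngram)
                matches.append(ngram)
    return matches

print(match_ngrams_largest_first("this model is on a par with the baseline"))
# ['is on a par with'] -- 'on a par with' and 'par with' are suppressed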
@@ -522,23 +539,41 @@ def identify_comparatives(sentence):
     # return all the patterns that were captured
     comparatives = straight_comptives + bigger_smaller_comparatives + equal_to_comparatives + single_verb + multi_verb
 
-    # since those different techniques might capture similar patterns, we keep only unique references. More precisely
-
+    # since those different techniques might capture similar patterns, we keep only unique references. More precisely,
+    # we discard any duplicate reference, as well as any reference that exists as a substring of another reference
 
-
-
-
+    # sort the list by length of the comparatives, in ascending order
+    comparatives.sort(key=lambda item: len(item['comparative'][0]), reverse=False)
+
+    unique_comparatives = {}
+    for i, item in enumerate(comparatives):
+        comparative = item['comparative'][0]
+        # check if the comparative is already in the dictionary, or is a substring/superstring of an existing comparative
+        is_unique = True
+        for existing_comp in unique_comparatives:
+            if (comparative in existing_comp) or (existing_comp in comparative):
+                is_unique = False
+                break
+        if is_unique:
+            unique_comparatives[comparative] = item
+        elif i == len(comparatives) - 1:
+            # if it's the last item and it's not unique, replace the first overlapping unique item with this item
+            for j, existing_item in enumerate(unique_comparatives.values()):
+                if (existing_item['comparative'][0] in comparative) or (comparative in existing_item['comparative'][0]):
+                    unique_comparatives.pop(list(unique_comparatives.keys())[j])
+                    unique_comparatives[comparative] = item
+                    break
 
     unique_output = list(unique_comparatives.values())
 
     return unique_output
 
 
-
 def comparatives_binding(sentence):
 
     try:
         comparative_symbols = find_comptives_symbols(sentence)
+
         comparative_mentions = identify_comparatives(sentence)
 
         # starting with the symbols, if one was captured
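A compact sketch of the effect of this de-duplication on toy detector output (omitting the last-item replacement branch for brevity); the input list is illustrative:

comparatives = [
    {'comparative': ['bigger', '>']},
    {'comparative': ['bigger than', '>']},
    {'comparative': ['less than', '<']},
]

# sort ascending by phrase length, then keep a phrase only if it is not a
# substring/superstring of one already kept
comparatives.sort(key=lambda item: len(item['comparative'][0]), reverse=False)

unique_comparatives = {}
for item in comparatives:
    comparative = item['comparative'][0]
    if not any(comparative in kept or kept in comparative for kept in unique_comparatives):
        unique_comparatives[comparative] = item

print(list(unique_comparatives.values()))
# [{'comparative': ['bigger', '>']}, {'comparative': ['less than', '<']}]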
|