# mopac-library/src/search_engine/acronym_finder/acronym_finder_function.py
# denovoref
# Add binary files from src/search_engine/index
# efeacc7
import re
import json
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz
# Connector tokens that may appear between the words of a definition; they are
# skipped when word initials are collected for comparison with an acronym.
stop_word_list = ['&','and','or','the','of','to','in','on','at','for','-']
# Matches "<Capitalised Phrase> (<ACRONYM>)".
#   group 1: two or more capitalised words, optionally joined by connector tokens
#            (whitespace, '&', 'and', 'or', 'the', 'of', 'to', 'in', 'on', 'at',
#            'for', 'an', '-'); a word may contain commas and straight or curly
#            apostrophes.
#   group 2: the letters/whitespace found inside the parentheses that follow.
# The curly quotes are written as \u2019 / \u2018 escapes (interpreted by `re`)
# so the pattern cannot be corrupted by a mis-decoded source file.
acronym_regex = r"([A-Z][\w,\u2019\u2018']+(?:(?:\s|&|and|or|the|of|to|in|on|at|for|an|-)+[A-Z][\w,\u2019\u2018']+){1,})\s\(([A-Za-z\s]+)\)"
def similar(a, b):
    '''Return difflib's similarity ratio (0.0 - 1.0) for the two strings.'''
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()
def quick_acronym_checker(acronym:str):
    '''
    Cheap pre-filter run before the thorough acronym check.

    Returns True only when the candidate has at least two characters
    and contains no more than one space.
    '''
    long_enough = len(acronym) >= 2
    at_most_one_space = acronym.count(' ') <= 1
    return long_enough and at_most_one_space
def acronym_checker(definition: str, acronym: str, stop_word_list: list) -> tuple:
    '''
    Score how well a candidate definition explains an acronym.

    The uppercase letters of the acronym are compared (via similar()) with the
    initial letters of the definition's words, ignoring stop words. A score of 1
    is a perfect match. When the match is not perfect, the definition is trimmed
    from the left: the acronym is walked right to left, and each letter is matched
    (case-insensitively) against the nearest word to the left of the previous
    match. The definition is then cut so that it starts at the left-most matched
    word, and the trimmed text is re-scored.

    Args:
        definition: candidate text preceding the acronym.
        acronym: the acronym as written in the text.
        stop_word_list: words (like "and") that may appear in the definition but
            contribute no letter to the acronym.

    Returns:
        A tuple (acronym, definition, score). The definition is whitespace
        normalised. On the trimmed path it is rebuilt by joining the remaining
        words with single spaces, so split hyphens become spaces and a trailing
        "'s" on the last word is dropped. The score is 0.0 when the acronym
        contains no uppercase letters.
    '''
    # Collapse every whitespace run (newlines, tabs, repeated spaces) to a single
    # space and trim the ends, so splitting on ' ' cannot yield empty words.
    definition = re.sub(r"\s+", " ", definition).strip()

    # Only the uppercase letters of the acronym take part in the comparison
    acronym_uppercase = ''.join(char for char in acronym if char.isupper())
    if not acronym_uppercase:
        # Nothing to match. Without this guard two empty letter strings would be
        # compared and reported as a perfect (1.0) match.
        return acronym, definition, 0.0

    # Split on spaces and on hyphens that sit between two non-space characters
    words = re.split(r'(?<=[^\s])-(?=[^\s])| ', definition)
    # Remove possible plural/possessive "'s" at the last word
    words[-1] = re.sub(r"(?:\u2019|'|`)s$", "", words[-1])

    # (position in `words`, cleaned word) for each non-empty word that is not a
    # stop word. Empty words (e.g. from "Foo--Bar") are skipped so that taking
    # word[0] below can never raise IndexError.
    candidates = []
    for index, word in enumerate(words):
        cleaned = word.strip(" '\"`\u2019\u2018")
        if cleaned and cleaned not in stop_word_list:
            candidates.append((index, cleaned))

    capital_letters = ''.join(word[0] for _, word in candidates)
    similarity_score = similar(acronym_uppercase, capital_letters)

    # Perfect match, or fewer candidate words than acronym letters: nothing to trim
    if similarity_score >= 1 or len(acronym_uppercase) > len(candidates):
        return acronym, definition, similarity_score

    # Walk the acronym right to left. Each letter may only match a word to the
    # left of the previous match, so repeated letters ("CCC") consume distinct
    # words instead of all hitting the same right-most word.
    final_index = len(words)
    search_end = len(candidates)
    for char in reversed(acronym_uppercase):
        for position in range(search_end - 1, -1, -1):
            index, word = candidates[position]
            if word[0].upper() == char:
                final_index = index
                search_end = position
                break

    # Re-score using only the words kept after trimming (same cleaned words and
    # stop-word filtering as the first score)
    kept_initials = ''.join(word[0] for index, word in candidates if index >= final_index)
    trimmed_definition = ' '.join(words[final_index:])
    return acronym, trimmed_definition, similar(acronym_uppercase, kept_initials)
def acronym_dict_generator(text:str,acronym_regex:str) -> dict:
    '''
    Search a block of text for acronyms and their candidate definitions.

    Args:
        text: the text to scan.
        acronym_regex: pattern whose first group captures the candidate
            definition and whose second group captures the acronym.

    Returns:
        A dictionary mapping each acronym to a list of (definition, score)
        tuples, sorted by score in descending order. Only candidates scoring
        above 0.5 are kept. Keys, and definitions with equal scores, appear in
        the order they were first found in the text.
    '''
    scored = []
    for match in re.findall(acronym_regex, text):
        definition, acronym = match[0], match[1]
        uppercase_count = sum(1 for char in acronym if char.isupper())
        # Need at least two capitals, and the acronym must pass the quick filter
        if uppercase_count > 1 and quick_acronym_checker(acronym):
            scored.append(acronym_checker(definition, acronym, stop_word_list))

    acronym_dict = {}
    # dict.fromkeys drops duplicate results while keeping first-seen order.
    # A set would make the output order depend on string hash randomisation.
    for acronym, definition, score in dict.fromkeys(scored):
        if score > 0.5:
            acronym_dict.setdefault(acronym, []).append((definition, score))

    # Sort each list of tuples by score in descending order (stable for ties)
    for candidates in acronym_dict.values():
        candidates.sort(key=lambda item: item[1], reverse=True)
    return acronym_dict
def group_and_select_best(acronym_dict:dict, threshold=80) -> dict:
    '''Groups acronym definitions based on string similarity (for example, "Borough Command Units" would be
    grouped with "Borough Command Unit") and selects the one with the best score from each group.

    Args:
        acronym_dict: maps each acronym to a sequence of (definition, score) pairs.
            The input is not modified.
        threshold: minimum fuzz.ratio value for a definition to join an existing group.

    Returns:
        A dictionary with keys as acronyms, and values as lists of (definition, score)
        tuples, one per group, in descending score order.
    '''
    result = {}
    for acronym, definitions in acronym_dict.items():
        # Work on a sorted copy so the caller's lists are left untouched
        ranked = sorted(definitions, key=lambda item: item[1], reverse=True)

        # Definitions are visited best-first and compared with the first member of
        # each group, so that first member is always the group's highest scorer.
        # Keeping only those representatives is enough.
        best_definitions = []
        for definition, score in ranked:
            already_grouped = any(
                fuzz.ratio(definition, kept_definition) >= threshold
                for kept_definition, _ in best_definitions
            )
            if not already_grouped:
                best_definitions.append((definition, score))

        result[acronym] = best_definitions
    return result