import re
import json
from difflib import SequenceMatcher

try:
    from fuzzywuzzy import fuzz
except ImportError:
    # fuzzywuzzy is only needed by group_and_select_best; when it is not
    # installed that function falls back to an equivalent difflib ratio.
    fuzz = None

# Connector tokens that may appear in a definition without contributing a
# letter to the acronym. Matching against this list is case-sensitive.
# NOTE(review): acronym_regex below also accepts "an" as a connector, but "an"
# is not listed here - confirm whether that difference is intentional.
stop_word_list = ['&', 'and', 'or', 'the', 'of', 'to', 'in', 'on', 'at', 'for', '-']

# Group 1: two or more capitalised words (optionally joined by connectors).
# Group 2: the parenthesised acronym candidate that follows them.
acronym_regex = r"([A-Z][\w,’‘']+(?:(?:\s|&|and|or|the|of|to|in|on|at|for|an|-)+[A-Z][\w,’‘']+){1,})\s\(([A-Za-z\s]+)\)"


def similar(a, b):
    """Return the difflib similarity ratio (0.0 to 1.0) between two strings."""
    return SequenceMatcher(None, a, b).ratio()


def quick_acronym_checker(acronym: str):
    """Cheaply decide whether an acronym candidate deserves a thorough check.

    A candidate passes when it has at least two characters and contains at
    most one space.
    """
    return len(acronym) > 1 and acronym.count(' ') <= 1


def acronym_checker(definition: str, acronym: str, stop_word_list: list) -> tuple:
    """Score how well a definition candidate explains an acronym.

    The uppercase letters of the acronym are compared with the first letters
    of the definition's words, ignoring stop words. When the full definition
    is not a perfect match, the acronym letters are matched against the words
    from right to left, and the definition is trimmed to start at the
    left-most matched word before being scored again.

    Args:
        definition: Candidate definition text (may contain newlines or
            repeated whitespace; these are normalised).
        acronym: Acronym candidate; only its uppercase letters are scored.
        stop_word_list: Words (such as "and") that may occur in the
            definition without contributing a letter to the acronym.

    Returns:
        A tuple (acronym, definition, score). A score of 1 means the
        initials of the returned definition match the acronym exactly.
    """
    # Normalise whitespace. Stripping the ends avoids empty tokens when the
    # text is split on spaces below.
    definition = re.sub(r"\n", " ", definition)
    definition = re.sub(r"\s{2,}", " ", definition).strip()

    acronym_uppercase = ''.join(char for char in acronym if char.isupper())
    if not acronym_uppercase:
        # Nothing to match against: an empty letter set would otherwise
        # compare equal to an empty definition and score a spurious 1.0.
        return acronym, definition, 0.0

    # Split on spaces and on hyphens that sit between two non-space characters
    words = re.split(r'(?<=[^\s])-(?=[^\s])| ', definition)
    # Remove a possessive suffix from the last word
    words[-1] = re.sub(r"(?:’|'|`)s$", "", words[-1])
    cleaned_words = [word.strip(" '\"`’‘") for word in words]

    # Keep each significant (non-empty, non-stop) word with its original
    # position so the definition can be sliced after matching.
    significant_words = [
        (index, word)
        for index, word in enumerate(cleaned_words)
        if word and word not in stop_word_list
    ]

    capital_letters = ''.join(word[0] for _, word in significant_words).strip()
    similarity_score = similar(acronym_uppercase, capital_letters)

    # Perfect match, or too few words to cover every acronym letter: the
    # definition is returned as-is with its score.
    if similarity_score >= 1 or len(acronym_uppercase) > len(significant_words):
        return acronym, definition, similarity_score

    # Walk the acronym backwards. Each letter is searched only among the
    # words to the left of the previous match, so repeated letters
    # (e.g. "ABA") bind to different words.
    final_index = len(words)
    search_end = len(significant_words)
    for char in reversed(acronym_uppercase):
        for position in range(search_end - 1, -1, -1):
            index, word = significant_words[position]
            if word[0].upper() == char:
                final_index = index
                search_end = position
                break

    final_definition = words[final_index:]
    final_capital_letters = ''.join(
        word[0] for index, word in significant_words if index >= final_index
    ).strip()
    return (
        acronym,
        ' '.join(final_definition),
        similar(acronym_uppercase, final_capital_letters),
    )


def acronym_dict_generator(text: str, acronym_regex: str) -> dict:
    """Search a block of text for acronyms and their definitions.

    Args:
        text: Text to scan.
        acronym_regex: Pattern whose first group captures the definition
            candidate and whose second group captures the acronym.

    Returns:
        A dictionary mapping each acronym to a list of (definition, score)
        tuples with score above 0.5, sorted by score in descending order.
        Definitions with equal scores keep their order of first appearance.
    """
    candidates = []
    for groups in re.findall(acronym_regex, text):
        uppercase_count = sum(1 for char in groups[1] if char.isupper())
        if uppercase_count > 1 and quick_acronym_checker(groups[1]):
            candidates.append(acronym_checker(groups[0], groups[1], stop_word_list))

    acronym_dict = {}
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output does not depend on set/hash ordering.
    for acronym, definition, score in dict.fromkeys(candidates):
        if score > 0.5:
            acronym_dict.setdefault(acronym, []).append((definition, score))

    for scored_definitions in acronym_dict.values():
        scored_definitions.sort(key=lambda item: item[1], reverse=True)

    return acronym_dict


def _definition_similarity(first: str, second: str) -> int:
    """Return a 0-100 similarity, using fuzzywuzzy when it is installed."""
    if fuzz is not None:
        return fuzz.ratio(first, second)
    if not first or not second:
        return 0
    return int(round(100 * SequenceMatcher(None, first, second).ratio()))


def group_and_select_best(acronym_dict: dict, threshold=80) -> dict:
    """Group near-identical definitions and keep the best one of each group.

    For every acronym, definitions are visited from highest to lowest score.
    A definition joins the first group whose leading definition is at least
    `threshold` similar to it (for example "Borough Command Units" joins
    "Borough Command Unit"); otherwise it starts a new group.

    Args:
        acronym_dict: Mapping of acronym to a list of (definition, score)
            tuples. The input lists are not modified.
        threshold: Minimum similarity (0-100) for two definitions to be
            placed in the same group.

    Returns:
        A dictionary mapping each acronym to a list holding the
        highest-scoring (definition, score) tuple of every group.
    """
    result = {}
    for acronym, definitions in acronym_dict.items():
        # sorted() leaves the caller's list untouched
        ranked = sorted(definitions, key=lambda item: item[1], reverse=True)

        groups = []
        for definition, score in ranked:
            for group in groups:
                # Compare against the group's first (highest-scoring) entry
                if _definition_similarity(definition, group[0][0]) >= threshold:
                    group.append((definition, score))
                    break
            else:
                groups.append([(definition, score)])

        result[acronym] = [max(group, key=lambda item: item[1]) for group in groups]

    return result