Spaces:
Sleeping
Sleeping
| import re | |
| import json | |
| from difflib import SequenceMatcher | |
| from fuzzywuzzy import fuzz | |
# Connector words/symbols that may sit inside a definition without contributing
# a letter to its acronym (e.g. "and" in "Research and Development (RD)").
# Kept in step with the connector alternatives of ``acronym_regex`` below,
# which is why 'an' is listed here as well.
stop_word_list = ['&', 'and', 'or', 'the', 'of', 'to', 'in', 'on', 'at', 'for', 'an', '-']

# Matches "<Definition> (<Acronym>)":
#   group 1 - two or more capitalised words, optionally separated by
#             whitespace, '&', '-' or one of the connector words;
#   group 2 - the parenthesised candidate acronym (letters and whitespace).
# NOTE(review): the 'β' characters in the word classes look like mis-encoded
# curly quotes - confirm against the original file before changing them.
acronym_regex = r"([A-Z][\w,ββ']+(?:(?:\s|&|and|or|the|of|to|in|on|at|for|an|-)+[A-Z][\w,ββ']+){1,})\s\(([A-Za-z\s]+)\)"
def similar(a, b):
    '''Return the difflib similarity ratio of two strings.

    The ratio is a float between 0.0 (nothing in common) and 1.0 (identical).
    '''
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def quick_acronym_checker(acronym: str):
    '''
    Cheap pre-filter deciding whether an acronym candidate deserves the full check.

    A candidate passes when it has at least two characters and contains
    at most one space. Returns True when it passes, False otherwise.
    '''
    long_enough = len(acronym) > 1
    few_enough_spaces = acronym.count(' ') <= 1
    return long_enough and few_enough_spaces
def acronym_checker(definition: str, acronym: str, stop_word_list: list) -> tuple:
    '''
    Score how well a candidate definition explains an acronym.

    The initials of the definition's significant words (stop words and empty
    tokens are ignored) are compared with the uppercase letters of the acronym
    using difflib's similarity ratio. A score of 1 means a perfect match.

    When the match is not perfect, the definition is trimmed from the left:
    the acronym's uppercase letters are walked right to left and each one is
    matched (case-insensitively) against the initial of a significant word,
    moving leftwards through the definition so that every word is used at
    most once. The definition is then cut at the left-most matched word and
    re-scored.

    Args:
        definition: Candidate definition text, e.g. "Borough Command Unit".
        acronym: Candidate acronym, e.g. "BCU".
        stop_word_list: Words that may appear in the definition (like "and")
            without contributing a letter to the acronym.

    Returns:
        A tuple ``(acronym, definition, score)``. The definition is the
        whitespace-normalised input when the match is perfect or cannot be
        trimmed; otherwise it is the trimmed words joined by single spaces
        (hyphens that were used as word separators are not restored).
    '''
    # Remove extra spaces and new lines
    definition = re.sub(r"\n", " ", definition)
    definition = re.sub(r"\s{2,}", " ", definition)

    # Only the uppercase letters of the acronym take part in the comparison
    acronym_uppercase = ''.join(char for char in acronym if char.isupper())

    # Split on spaces and on hyphens that join two non-space characters
    words = re.split(r'(?<=[^\s])-(?=[^\s])| ', definition)
    # Remove possible plural/possessive suffix at the last word
    # NOTE(review): the 'β' characters below look like mis-encoded curly
    # quotes - confirm against the original file before changing them.
    words[-1] = re.sub(r"(?:β|'|`)s$", "", words[-1])
    cleaned_words = [word.strip(" '\"`ββ") for word in words]

    stop_words = set(stop_word_list)
    # (original index, word) for every word that should contribute an initial.
    # Tokens that are empty after stripping quotes are skipped so that taking
    # word[0] below can never fail.
    significant_words = [
        (index, word)
        for index, word in enumerate(cleaned_words)
        if word and word not in stop_words
    ]

    capital_letters = ''.join(word[0] for _, word in significant_words)
    similarity_score = SequenceMatcher(None, acronym_uppercase, capital_letters).ratio()

    if similarity_score >= 1:
        return acronym, definition, similarity_score

    # More acronym letters than candidate words: trimming cannot help
    if len(acronym_uppercase) > len(significant_words):
        return acronym, definition, similarity_score

    # Walk the acronym right to left, consuming words right to left. The
    # cursor moves past each matched word so a repeated letter (the two H's
    # of "HHS") is matched against two different words.
    final_index = None
    cursor = len(significant_words) - 1
    for char in reversed(acronym_uppercase):
        for position in range(cursor, -1, -1):
            index, word = significant_words[position]
            if word[0].upper() == char:
                final_index = index
                cursor = position - 1
                break

    # No letter matched any word: nothing sensible to trim to
    if final_index is None:
        return acronym, definition, similarity_score

    final_definition = words[final_index:]
    # Score with the cleaned words so quotes are ignored, as in the first pass
    final_capital_letters = ''.join(
        word[0]
        for word in cleaned_words[final_index:]
        if word and word not in stop_words
    )
    final_score = SequenceMatcher(None, acronym_uppercase, final_capital_letters).ratio()
    return acronym, ' '.join(final_definition), final_score
def acronym_dict_generator(text:str,acronym_regex:str) -> dict:
    '''Search a block of text for acronyms and their candidate definitions.

    Args:
        text: The text to search.
        acronym_regex: Pattern whose first group captures the definition and
            whose second group captures the acronym.

    Returns:
        A dictionary mapping each acronym to a list of ``(definition, score)``
        tuples with a score above 0.5, sorted by score in descending order.
        Equal-scored definitions keep the order in which they first appear in
        the text, so the output is the same on every run.
    '''
    candidates = []
    for match in re.findall(acronym_regex, text):
        definition, acronym = match[0], match[1]
        # Need at least two uppercase letters to be worth a thorough check
        uppercase_count = sum(1 for char in acronym if char.isupper())
        if uppercase_count > 1 and quick_acronym_checker(acronym):
            candidates.append(acronym_checker(definition, acronym, stop_word_list))

    acronym_dict = {}
    # dict.fromkeys removes duplicates while preserving first-seen order;
    # a set would make the order of tied scores depend on string hashing.
    for acronym, definition, score in dict.fromkeys(candidates):
        if score > 0.5:
            acronym_dict.setdefault(acronym, []).append((definition, score))

    # Sort each list of tuples by score in descending order (stable sort)
    for scored_definitions in acronym_dict.values():
        scored_definitions.sort(key=lambda item: item[1], reverse=True)
    return acronym_dict
def group_and_select_best(acronym_dict:dict, threshold=80) -> dict:
    '''Collapse near-duplicate definitions of each acronym, keeping the best-scored one.

    Definitions are grouped by string similarity (for example, "Borough
    Command Units" would be grouped with "Borough Command Unit"): a definition
    joins a group when its ``fuzz.ratio`` against the group's first, i.e.
    highest-scoring, member is at least ``threshold``. Only that highest-scoring
    member of each group is kept.

    Args:
        acronym_dict: Maps each acronym to an iterable of
            ``(definition, score)`` pairs. It is not modified.
        threshold: Minimum ``fuzz.ratio`` for two definitions to be treated
            as variants of one another.

    Returns:
        A dictionary with acronyms as keys and, as values, lists of
        ``(definition, score)`` tuples, one per group, in descending score
        order.
    '''
    result = {}
    for acronym, definitions in acronym_dict.items():
        # sorted() (rather than .sort()) leaves the caller's data untouched;
        # highest scores first so each group's first member is its best one.
        ranked = sorted(definitions, key=lambda item: item[1], reverse=True)

        # One representative per group. Because of the ordering above, the
        # representative is also the group's highest-scoring definition.
        best_definitions = []
        for definition, score in ranked:
            is_variant = any(
                fuzz.ratio(definition, kept_definition) >= threshold
                for kept_definition, _ in best_definitions
            )
            if not is_variant:
                best_definitions.append((definition, score))
        result[acronym] = best_definitions
    return result