Spaces:

ryan-mopac
/

mopac-library

Sleeping

mopac-library / src /search_engine /acronym_finder /acronym_finder_function.py

denovoref

Add binary files from src/search_engine/index

efeacc7 over 1 year ago

6.08 kB

	import re
	import json
	from difflib import SequenceMatcher
	from fuzzywuzzy import fuzz

	# Connector tokens that may appear inside a definition without contributing
	# a letter to its acronym (e.g. the "of" in "Department of Health").
	stop_word_list = ['&','and','or','the','of','to','in','on','at','for','-']

	# Matches "<Capitalised Definition> (<ACRONYM>)":
	#   group 1 - the definition: two or more capitalised words, optionally
	#             joined by whitespace / connector tokens;
	#   group 2 - the letters (and spaces) found between the parentheses.
	# The connectors must be a real alternation ("|") and the parentheses must be
	# escaped ("\(" / "\)"); otherwise the pattern can never match running text.
	# NOTE(review): the alternation accepts "an", which is not in stop_word_list -
	# confirm whether the two are meant to agree.
	acronym_regex = r"([A-Z][\w,’‘']+(?:(?:\s|&|and|or|the|of|to|in|on|at|for|an|-)+[A-Z][\w,’‘']+){1,})\s\(([A-Za-z\s]+)\)"

	def similar(a, b):
	    '''
	    Measure similarity between two strings.

	    Returns difflib's SequenceMatcher ratio for ``a`` and ``b``: a float
	    between 0.0 (nothing in common) and 1.0 (identical sequences).
	    '''
	    return SequenceMatcher(None, a, b).ratio()

	def quick_acronym_checker(acronym:str):
	    '''
	    Quickly checks if an acronym is a suitable candidate to check thoroughly.

	    A candidate must be at least 2 characters long and contain at most
	    one space. Returns True when both conditions hold, otherwise False.
	    '''
	    return len(acronym) > 1 and acronym.count(' ') <= 1

	def acronym_checker(definition: str, acronym: str, stop_word_list: list) -> tuple:
	    '''
	    Takes an acronym and its associated definition candidate, and determines a
	    similarity score based on how likely the definition is to be related to the
	    acronym. A score of 1 implies a perfect match.

	    The score is the SequenceMatcher ratio between the uppercase letters of the
	    acronym and the first letters of the definition's non-stop words. When the
	    match is imperfect, the definition is trimmed from the left: for each
	    acronym letter (taken right to left) the right-most non-stop word starting
	    with that letter is located, and the index found last becomes the start of
	    the trimmed definition, which is then re-scored.

	    definition: candidate text preceding the acronym.
	    acronym: the acronym as it appeared in the text.
	    stop_word_list: words which may exist in the definition (like "and") but
	        not in the acronym.

	    Returns a tuple (acronym, definition, score); the definition is the
	    whitespace-normalised input, or its trimmed tail when trimming was applied.
	    '''
	    # Remove extra spaces and new lines
	    definition = re.sub(r"\n", " ", definition)
	    definition = re.sub(r"\s{2,}", " ", definition)

	    # Only the uppercase letters of the acronym take part in the comparison
	    acronym_uppercase = ''.join(char for char in acronym if char.isupper())
	    acronym_reversed = acronym_uppercase[::-1]

	    # Split on spaces and on hyphens that sit between two non-space characters
	    words = re.split(r'(?<=[^\s])-(?=[^\s])| ', definition)

	    # Remove a possible trailing possessive/plural marker ('s) on the last word
	    words[-1] = re.sub(r"(?:’|'|`)s$", "", words[-1])

	    cleaned_words = [word.strip(" '\"`’‘") for word in words]

	    # Keep each significant word with its original position. Empty tokens
	    # (e.g. produced by "--" or a leading space) are skipped so that word[0]
	    # below cannot raise IndexError.
	    filtered_words_with_index = [
	        (index, word)
	        for index, word in enumerate(cleaned_words)
	        if word and word not in stop_word_list
	    ]

	    capital_letters = ''.join(word[0] for _, word in filtered_words_with_index).strip()

	    similarity_score = similar(acronym_uppercase, capital_letters)

	    # Perfect match: nothing to trim
	    if similarity_score >= 1:
	        return acronym, definition, similarity_score

	    # Fewer significant words than acronym letters: trimming cannot help
	    if len(acronym_reversed) > len(filtered_words_with_index):
	        return acronym, definition, similarity_score

	    # For every acronym letter, right to left, remember the position of the
	    # right-most significant word starting with it. Each search restarts from
	    # the end of the definition; if no letter matches, the index stays at
	    # len(words) and the trimmed definition is empty.
	    final_index = len(words)
	    for char in acronym_reversed:
	        for index, word in reversed(filtered_words_with_index):
	            if char == word[0].upper():
	                final_index = index
	                break

	    # Re-score using only the tail of the definition from the matched position
	    final_definition = words[final_index:]
	    final_capital_letters = ''.join(
	        word[0]
	        for word in final_definition
	        if word and word not in stop_word_list
	    ).strip()

	    return acronym, ' '.join(final_definition), similar(acronym_uppercase, final_capital_letters)

	def acronym_dict_generator(text:str,acronym_regex:str) -> dict:
	    '''
	    Takes a block of text and searches for acronyms.

	    text: the text to scan.
	    acronym_regex: pattern whose first group captures the definition and whose
	        second group captures the acronym.

	    Outputs a dictionary mapping each acronym to a list of (definition, score)
	    tuples with score > 0.5, sorted by score in descending order.
	    '''
	    candidates = []
	    for match in re.findall(acronym_regex, text):
	        definition, acronym = match[0], match[1]
	        # Need at least two uppercase letters before the thorough check
	        uppercase_count = sum(1 for char in acronym if char.isupper())
	        if uppercase_count > 1 and quick_acronym_checker(acronym):
	            candidates.append(acronym_checker(definition, acronym, stop_word_list))

	    acronym_dict = {}
	    # dict.fromkeys drops duplicate results while keeping first-seen order, so
	    # equal-score definitions come out in the same order on every run.
	    for acronym, definition, score in dict.fromkeys(candidates):
	        if score > 0.5:
	            acronym_dict.setdefault(acronym, []).append((definition, score))

	    # Sort each list of tuples by score in descending order
	    for scored_definitions in acronym_dict.values():
	        scored_definitions.sort(key=lambda item: item[1], reverse=True)

	    return acronym_dict

	def group_and_select_best(acronym_dict:dict, threshold=80) -> dict:
	    '''
	    Groups acronym definitions based on string similarity (for example,
	    "Borough Command Units" would be grouped with "Borough Command Unit") and
	    selects the one with the best score from each group.

	    acronym_dict: maps each acronym to a list of (definition, score) tuples.
	        The input lists are not modified.
	    threshold: minimum fuzz.ratio between a definition and a group's first
	        (best-scoring) member for the definition to join that group.

	    Outputs a dictionary with keys as acronyms, and values as lists of
	    (definition, score) tuples, one per group.
	    '''
	    result = {}

	    for acronym, definitions in acronym_dict.items():
	        # Work on a sorted copy (best score first) instead of sorting the
	        # caller's list in place.
	        ranked = sorted(definitions, key=lambda item: item[1], reverse=True)

	        # Definitions are visited best-first, so the first member of every
	        # group is also its highest-scoring one; keeping just that member
	        # yields the same result as collecting the group and taking its max.
	        best_definitions = []
	        for definition, score in ranked:
	            joins_existing_group = any(
	                fuzz.ratio(definition, best_definition) >= threshold
	                for best_definition, _ in best_definitions
	            )
	            if not joins_existing_group:
	                best_definitions.append((definition, score))

	        result[acronym] = best_definitions

	    return result