import re
import json
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz

# Connector words and symbols that can appear inside a definition without
# contributing a letter to its acronym (e.g. the "of" in "Department of Defense").
# acronym_checker drops tokens that exactly match one of these (case-sensitive).
stop_word_list = ['&','and','or','the','of','to','in','on','at','for','-']

# Pattern for "<Definition> (<ACRONYM>)".
#   Group 1 (definition candidate): a word starting with a capital letter,
#     followed by one or more further capitalised words, each separated by
#     whitespace and/or connectors (&, and, or, the, of, to, in, on, at, for,
#     an, -). Note that "an" is accepted here but is not in stop_word_list.
#   Group 2 (acronym candidate): the letters/whitespace inside the parentheses
#     that follow the definition after a single whitespace character.
acronym_regex = r"([A-Z][\w,’‘']+(?:(?:\s|&|and|or|the|of|to|in|on|at|for|an|-)+[A-Z][\w,’‘']+){1,})\s\(([A-Za-z\s]+)\)"

def similar(a, b):
    '''Return the difflib similarity ratio of two strings (1.0 means identical).'''
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def quick_acronym_checker(acronym:str):
    '''
    Cheap pre-filter run before the thorough acronym check.

    Returns True only when the candidate is at least two characters long
    and contains at most one space character; otherwise returns False.
    '''
    long_enough = len(acronym) > 1
    at_most_one_space = acronym.count(' ') <= 1
    return long_enough and at_most_one_space

def acronym_checker(definition: str, acronym: str, stop_word_list: list) -> tuple:
    '''
    Score how well a candidate definition matches an acronym.

    The uppercase letters of the acronym are compared (via ``similar``) with
    the initials of the definition's words, ignoring stop words. If the match
    is not perfect, the definition is trimmed from the left: the acronym is
    walked backwards while the definition's words are walked right to left,
    each acronym letter looking for a word that starts with it to the left of
    the previous match. The trimmed definition is then re-scored.

    Parameters:
        definition: text that preceded the parenthesised acronym.
        acronym: the candidate acronym; only its uppercase characters are used.
        stop_word_list: tokens (e.g. "and", "of") that are ignored when
            collecting initials. Matching is exact and case-sensitive.

    Returns:
        A tuple ``(acronym, definition, score)`` where score is in [0, 1] and
        1 means the initials match the acronym exactly.
        * Perfect first match, or more acronym letters than usable words:
          the whitespace-normalised definition and the first score.
        * Otherwise: the trimmed definition (its words joined with single
          spaces, trailing possessive removed) and the re-computed score.
          If no acronym letter matches any word the definition is '' and
          the score is 0.0.
        * Acronym without any uppercase letter: the normalised definition
          with a score of 0.0.
    '''
    # Remove extra spaces and new lines
    definition = re.sub(r"\n", " ", definition)
    definition = re.sub(r"\s{2,}", " ", definition)

    acronym_uppercase = ''.join(char for char in acronym if char.isupper())
    if not acronym_uppercase:
        # Nothing to compare. Without this guard two empty strings would be
        # reported as a perfect (1.0) match.
        return acronym, definition, 0.0

    # Split on spaces and on hyphens that sit between two non-space characters
    words = re.split(r'(?<=[^\s])-(?=[^\s])| ', definition)

    # Remove possible possessive/plural suffix at the last word
    words[-1] = re.sub(r"(?:’|'|`)s$", "", words[-1])

    cleaned_words = [word.strip(" '\"`’‘") for word in words]

    # Keep each usable word together with its position in `words`, so that a
    # match can be mapped back to a slice of the definition. Empty tokens
    # (e.g. produced by "--" or a trailing space) have no initial and are
    # skipped, as are stop words.
    indexed_words = [
        (index, word)
        for index, word in enumerate(cleaned_words)
        if word and word not in stop_word_list
    ]

    capital_letters = ''.join(word[0] for _, word in indexed_words).strip()
    similarity_score = similar(acronym_uppercase, capital_letters)

    if similarity_score >= 1:
        return acronym, definition, similarity_score

    # More acronym letters than usable words: trimming cannot help
    if len(acronym_uppercase) > len(indexed_words):
        return acronym, definition, similarity_score

    # Walk the acronym backwards. Each letter searches leftwards starting just
    # before the previous match, so repeated letters (e.g. the two A's of
    # "AMA") are matched to different words. A letter with no match is skipped.
    final_index = len(words)
    position = len(indexed_words) - 1
    for char in reversed(acronym_uppercase):
        for candidate in range(position, -1, -1):
            index, word = indexed_words[candidate]
            if char == word[0].upper():
                final_index = index
                position = candidate - 1
                break

    # Keep everything from the left-most matched word onwards and re-score it
    # using the same cleaned, stop-word-free initials as the first score.
    final_definition = words[final_index:]
    final_capital_letters = ''.join(
        word[0] for index, word in indexed_words if index >= final_index
    ).strip()

    return (
        acronym,
        ' '.join(final_definition),
        similar(acronym_uppercase, final_capital_letters),
    )

def acronym_dict_generator(text:str,acronym_regex:str) -> dict:
    '''
    Search a block of text for "<definition> (<acronym>)" pairs.

    Parameters:
        text: the text to scan.
        acronym_regex: pattern passed to ``re.findall``; its first group must
            capture the definition candidate and its second group the
            acronym candidate.

    Returns:
        A dict mapping each acronym to a list of ``(definition, score)``
        tuples, best score first. Only candidates with at least two uppercase
        letters that pass ``quick_acronym_checker`` are scored, and only
        scores above 0.5 are kept. Acronyms appear in order of first
        occurrence in the text, and equal scores keep their text order.
    '''
    candidates = []
    for match in re.findall(acronym_regex, text):
        definition, acronym = match[0], match[1]
        uppercase_count = sum(1 for char in acronym if char.isupper())
        if uppercase_count > 1 and quick_acronym_checker(acronym):
            candidates.append(acronym_checker(definition, acronym, stop_word_list))

    acronym_dict = {}
    # dict.fromkeys drops duplicate results while keeping first-seen order;
    # a set would make the output order vary between runs.
    for acronym, definition, score in dict.fromkeys(candidates):
        if score > 0.5:
            acronym_dict.setdefault(acronym, []).append((definition, score))

    # Sort each list of tuples by score in descending order (stable sort)
    for scored_definitions in acronym_dict.values():
        scored_definitions.sort(key=lambda item: item[1], reverse=True)

    return acronym_dict

def group_and_select_best(acronym_dict:dict, threshold=80) -> dict:
    '''
    Collapse near-duplicate definitions of each acronym, keeping the best one.

    For every acronym the definitions are visited from highest to lowest
    score. A definition whose ``fuzz.ratio`` against an already kept
    definition is at least ``threshold`` is treated as a variant of it (for
    example "Borough Command Units" vs "Borough Command Unit") and dropped;
    otherwise it is kept as the representative of a new group.

    Parameters:
        acronym_dict: mapping of acronym -> iterable of (definition, score)
            pairs. It is not modified.
        threshold: minimum ``fuzz.ratio`` for two definitions to be
            considered the same.

    Returns:
        A new dict with the same keys, whose values are lists of
        ``(definition, score)`` tuples: the highest-scoring member of each
        group, ordered by descending score.
    '''
    result = {}

    for acronym, definitions in acronym_dict.items():
        # sorted() builds a new list, so the caller's lists keep their order
        ranked = sorted(definitions, key=lambda item: item[1], reverse=True)

        # Because candidates arrive best-first, the first member of every
        # group is also its highest-scoring one, so only that representative
        # needs to be stored.
        best_definitions = []
        for definition, score in ranked:
            is_variant = any(
                fuzz.ratio(definition, kept_definition) >= threshold
                for kept_definition, _ in best_definitions
            )
            if not is_variant:
                best_definitions.append((definition, score))

        result[acronym] = best_definitions

    return result