"""Text helpers: Latin-script detection, romanization and character statistics."""

__copyright__ = "Copyright (C) 2023 Ali Mustapha"
__license__ = "GPL-3.0-or-later"

import re
import unicodedata

import pandas as od
import regex
from unidecode import unidecode

# Compiled once at import time. ``\p{Latin}`` is a Unicode script property
# understood by the third-party ``regex`` module.
_LATIN_ONLY = regex.compile(r'\p{Latin}+', flags=regex.UNICODE)


def is_most_common_char(s, max_ratio=0.90):
    """Tell whether one Latin character dominates ``s``.

    The string is scanned left to right while counting each character.

    Args:
        s: The string to inspect.
        max_ratio: Fraction of ``len(s)`` that a single character's count
            must strictly exceed. Defaults to 0.90.

    Returns:
        True as soon as some character's running count exceeds
        ``len(s) * max_ratio``. False if a character whose Unicode name does
        not start with ``LATIN`` (digits, spaces, punctuation, unnamed code
        points, ...) is met first, if no character ever exceeds the
        threshold, or if ``s`` is empty.

    NOTE(review): the result depends on character order. A non-Latin
    character that appears *after* the threshold has been crossed is never
    looked at. Confirm with callers whether that is intended.
    """
    max_count = len(s) * max_ratio
    char_count = {}
    for c in s:
        # An unnamed code point yields "" here, which also fails the test.
        if not unicodedata.name(c, "").startswith("LATIN"):
            return False
        char_count[c] = char_count.get(c, 0) + 1
        if char_count[c] > max_count:
            return True
    return False


def find_common_item(list_array):
    """Return the majority label among the first items of ``list_array``.

    Only the first element of each entry is considered, and only the values
    0, 1 and 2 are counted (presumably male / female / unknown, judging by
    the original local names -- verify against callers).

    Args:
        list_array: Iterable of indexable entries whose first element is a
            label.

    Returns:
        2 when label 2 is strictly more frequent than both 0 and 1;
        otherwise 0 or 1, whichever is strictly more frequent; and 2 when
        labels 0 and 1 are tied (which includes empty input).
    """
    labels = [pair[0] for pair in list_array]
    m_count = labels.count(0)
    f_count = labels.count(1)
    u_count = labels.count(2)

    if u_count > max(m_count, f_count):
        return 2
    if m_count > f_count:
        return 0
    if f_count > m_count:
        return 1
    return 2  # tie between labels 0 and 1


def is_roman_language(text):
    """Return True when ``text`` is non-empty and made only of Latin-script characters.

    The whole string must match: spaces, digits, punctuation and a trailing
    newline all make the result False.
    """
    # fullmatch instead of ``^...$``: ``$`` would also match just before a
    # final newline and wrongly accept "abc\n".
    return _LATIN_ONLY.fullmatch(text) is not None


def text_to_romanize(text):
    """Return ``text`` unchanged if it is purely Latin script, else its ``unidecode`` transliteration."""
    return text if is_roman_language(text) else unidecode(text)


def is_alpha(s: str, min_alpha=0.60) -> bool:
    """Tell whether enough of ``s`` consists of letters or spaces.

    A character counts when its Unicode general category starts with ``L``
    (any letter) or equals ``Zs`` (space separator).

    Args:
        s: The string to inspect.
        min_alpha: Minimum fraction of counted characters required.

    Returns:
        False for an empty string; otherwise whether the counted fraction
        is greater than or equal to ``min_alpha``.
    """
    if not s:
        return False
    # Look up each character's category once; summing booleans counts hits.
    alpha_chars = sum(
        category.startswith("L") or category == "Zs"
        for category in map(unicodedata.category, s)
    )
    return alpha_chars / len(s) >= min_alpha


def remove_spaces_from_ends(input_string):
    """Return ``input_string`` without leading and trailing whitespace.

    ``str.strip`` removes the same Unicode whitespace that ``\\s`` matches,
    without the quadratic backtracking the regex ``^\\s+|\\s+$`` suffers on
    long interior runs of whitespace.
    """
    return input_string.strip()