Spaces:

AliMustapha
/

Geo-GenderStudy

Runtime error

App Files Files Community

AliMustapha committed on Aug 27, 2023

Commit

de491c8

1 Parent(s): 75b27db

add data_utils file

Browse files

Files changed (1) hide show

utils/data_utils.py +55 -0

utils/data_utils.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from unidecode import unidecode
+import pandas as od
+import regex
+import unicodedata
def is_most_common_char(s: str, threshold: float = 0.90) -> bool:
    """Tell whether one Latin character dominates an all-Latin string.

    A character counts as "Latin" when its Unicode name starts with
    ``LATIN``; digits, spaces, punctuation and unnamed code points
    therefore do not.

    Args:
        s: The string to inspect.
        threshold: Fraction of ``len(s)`` that a single character's count
            must strictly exceed. Defaults to 0.90.

    Returns:
        True if every character of ``s`` is Latin and some character occurs
        more than ``threshold * len(s)`` times; False otherwise, including
        for the empty string.
    """
    max_count = len(s) * threshold
    char_count = {}
    for c in s:
        # One lookup with a "" default covers both unnamed and non-Latin
        # characters, since "" never starts with "LATIN".
        if not unicodedata.name(c, "").startswith("LATIN"):
            return False
        char_count[c] = char_count.get(c, 0) + 1
    # Decide only after the whole string has been validated, so the answer
    # does not depend on where a non-Latin character happens to sit.
    return any(count > max_count for count in char_count.values())
def find_common_item(list_array):
    """Pick the majority label from a sequence of ``(label, ...)`` items.

    Only the first element of each item is considered, and only the label
    values 0, 1 and 2 are counted.
    NOTE(review): judging by the original counter names (m/f/u) these are
    presumably male / female / unknown codes -- confirm against callers.

    Args:
        list_array: Iterable of indexable items whose element ``[0]`` is the
            label.

    Returns:
        2 if label 2 is strictly more frequent than both 0 and 1, or if
        labels 0 and 1 are tied (this includes empty input); otherwise
        whichever of 0 or 1 is more frequent.
    """
    labels = [pair[0] for pair in list_array]
    # Count without materialising a throwaway list per label.
    m_count = sum(1 for g in labels if g == 0)
    f_count = sum(1 for g in labels if g == 1)
    u_count = sum(1 for g in labels if g == 2)

    if u_count > max(m_count, f_count):
        return 2
    if m_count > f_count:
        return 0
    if f_count > m_count:
        return 1
    # Tie between 0 and 1: fall back to label 2.
    return 2
def is_roman_language(text):
    """Return True when ``text`` consists solely of Latin-script letters.

    The check uses the ``\\p{Latin}`` Unicode script property from the
    third-party ``regex`` module, so spaces, digits and punctuation make
    the result False, as does the empty string.

    Args:
        text: The string to test.

    Returns:
        True if every character of ``text`` is a Latin-script character and
        ``text`` is non-empty, else False.
    """
    # fullmatch instead of ``^...$``: ``$`` also matches just before a
    # trailing newline, which would let "abc\n" pass as purely Latin.
    roman_pattern = r'\p{Latin}+'
    match = regex.fullmatch(roman_pattern, text, flags=regex.UNICODE)
    return match is not None
def text_to_romanize(text):
    """Return ``text`` as-is if it is purely Latin script, else transliterate it.

    Text that ``is_roman_language`` rejects is passed through ``unidecode``
    and that result is returned.
    """
    if is_roman_language(text):
        return text
    return unidecode(text)
def is_alpha(s: str, min_alpha: float = 0.60) -> bool:
    """Tell whether enough of ``s`` is made of letters or space separators.

    A character counts when its Unicode general category starts with ``L``
    (any letter) or equals ``Zs`` (space separator). Tabs and newlines are
    category ``Cc`` and therefore do not count.

    Args:
        s: The string to inspect.
        min_alpha: Minimum fraction of counted characters required.
            Defaults to 0.60.

    Returns:
        False for an empty string; otherwise True if the fraction of counted
        characters is at least ``min_alpha``.
    """
    if not s:
        return False
    alpha_chars = 0
    for ch in s:
        # Look the category up once per character rather than twice.
        category = unicodedata.category(ch)
        if category.startswith("L") or category == "Zs":
            alpha_chars += 1
    return alpha_chars / len(s) >= min_alpha
def remove_spaces_from_ends(input_string):
    """Return ``input_string`` with leading and trailing whitespace removed.

    Interior whitespace is left untouched.
    """
    # The previous version called ``re.sub`` but this file never imports
    # ``re`` (only ``regex``), so it raised NameError on every call.
    # ``str.strip()`` trims the same leading/trailing whitespace with no import.
    return input_string.strip()