AliMustapha committed on
Commit
de491c8
·
1 Parent(s): 75b27db

add data_utils file

Browse files
Files changed (1) hide show
  1. utils/data_utils.py +55 -0
utils/data_utils.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from unidecode import unidecode
2
+ import pandas as od
3
+ import regex
4
+ import unicodedata
5
+
6
def is_most_common_char(s):
    """Return True if a single Latin character dominates the string *s*.

    The string is scanned left to right. A running tally is kept per
    character, and the function returns True as soon as one character's
    tally exceeds 90% of ``len(s)``.

    The scan aborts with False the moment it meets a character whose
    Unicode name is missing or does not start with ``LATIN`` (spaces,
    digits and punctuation all fall in this group). Because both exits
    happen during the scan, the result depends on character order: a
    dominant character found early wins even if a non-Latin character
    follows later.

    An empty string yields False.
    """
    threshold = 0.90 * len(s)  # a tally must be strictly above this
    tally = {}
    for ch in s:
        # An unnamed code point yields "", which never starts with "LATIN".
        if not unicodedata.name(ch, "").startswith('LATIN'):
            return False
        occurrences = tally.get(ch, 0) + 1
        tally[ch] = occurrences
        if occurrences > threshold:
            return True
    # Every character was Latin, but none was frequent enough.
    return False
16
+
17
def find_common_item(list_array):
    """Return the majority label (0, 1 or 2) among the first items of pairs.

    Only the first element of each entry in *list_array* is considered.
    The decision rules, in order:

    * 2 if label 2 occurs strictly more often than both 0 and 1;
    * 0 if label 0 occurs strictly more often than label 1;
    * 1 if label 1 occurs strictly more often than label 0;
    * 2 otherwise (labels 0 and 1 are tied, which includes empty input).
    """
    labels = [entry[0] for entry in list_array]

    zeros = sum(1 for label in labels if label == 0)
    ones = sum(1 for label in labels if label == 1)
    twos = sum(1 for label in labels if label == 2)

    if twos > max(zeros, ones):
        return 2
    if zeros > ones:
        return 0
    if ones > zeros:
        return 1
    # Tie between 0 and 1: fall back to label 2.
    return 2
33
+
34
def is_roman_language(text):
    """Return True if *text* consists solely of Latin-script characters.

    Relies on the third-party ``regex`` module for the ``\\p{Latin}``
    script property. The pattern requires at least one character, so an
    empty string gives False; anything outside the Latin script (spaces,
    digits, punctuation, other scripts) also gives False.
    """
    return regex.match(r'^\p{Latin}+$', text, flags=regex.UNICODE) is not None
38
+
39
def text_to_romanize(text):
    """Return *text* unchanged if it is all Latin script, else transliterate it.

    Text for which ``is_roman_language`` returns False is passed through
    ``unidecode`` and the result is returned.
    """
    return text if is_roman_language(text) else unidecode(text)
44
+
45
def is_alpha(s:str, min_alpha=0.60)->bool:
    """Return True if enough of *s* is made of letters or space separators.

    A character counts when its Unicode general category starts with
    ``L`` (any letter) or equals ``Zs`` (space separator). The result is
    True when the share of counted characters is at least *min_alpha*.
    An empty string always yields False.
    """
    if not s:
        return False

    def _counts(ch):
        category = unicodedata.category(ch)
        return category.startswith("L") or category == "Zs"

    matching = sum(1 for ch in s if _counts(ch))
    return matching / len(s) >= min_alpha
53
+
54
def remove_spaces_from_ends(input_string):
    """Return *input_string* with leading and trailing whitespace removed.

    Whitespace is whatever ``\\s`` matches in the standard ``re`` module
    (spaces, tabs, newlines and other Unicode whitespace). Whitespace
    inside the string is left untouched.
    """
    # The module imports `regex` but not `re`, so the name is bound here
    # to keep this function from raising NameError when called.
    import re

    return re.sub(r'^\s+|\s+$', '', input_string)