Spaces:

AliMustapha
/

Geo-GenderStudy

Runtime error

App Files Files Community

Geo-GenderStudy / utils /data_utils.py

AliMustapha

add re

9ec8f41 over 2 years ago

raw

history blame contribute delete

2.03 kB

	__copyright__ = "Copyright (C) 2023 Ali Mustapha"
	__license__ = "GPL-3.0-or-later"

	from unidecode import unidecode
	import pandas as od
	import regex
	import unicodedata
	import re
	def is_most_common_char(s):
	    """Report whether a single Latin character dominates ``s``.

	    Characters are scanned left to right. The scan stops with ``False`` at
	    the first character whose Unicode name is missing or does not begin
	    with ``LATIN``, and stops with ``True`` as soon as the running tally of
	    any one character exceeds 90% of ``len(s)``. An empty string, or a
	    string in which no character crosses that threshold, yields ``False``.
	    """
	    threshold = 0.90 * len(s)
	    tallies = {}
	    for ch in s:
	        # Unnamed code points fall back to "", which fails the prefix test too.
	        if not unicodedata.name(ch, "").startswith('LATIN'):
	            return False
	        seen = tallies.get(ch, 0) + 1
	        tallies[ch] = seen
	        if seen > threshold:
	            return True
	    return False

	def find_common_item(list_array):
	    """Choose the winning label among the first elements of ``list_array``.

	    Each entry is indexed with ``[0]`` to obtain its label; only labels
	    equal to 0, 1 or 2 are tallied.

	    Returns:
	        2 when label 2 strictly outnumbers both 0 and 1, or when 0 and 1
	        are tied (including an empty input); otherwise whichever of 0 and
	        1 occurs more often.
	    """
	    labels = [entry[0] for entry in list_array]
	    zeros = labels.count(0)
	    ones = labels.count(1)
	    twos = labels.count(2)
	    # Label 2 wins outright only when it beats both other tallies.
	    if twos > zeros and twos > ones:
	        return 2
	    if zeros > ones:
	        return 0
	    if ones > zeros:
	        return 1
	    # A tie between 0 and 1 is reported as 2.
	    return 2

	def is_roman_language(text):
	    """Tell whether ``text`` is matched entirely by the Latin-script pattern.

	    Returns ``True`` when the ``regex`` module matches one or more
	    Latin-script characters spanning ``text`` from start to end, and
	    ``False`` otherwise (an empty string never matches, since at least one
	    character is required).
	    """
	    # \p{Latin} is a Unicode script property supported by the regex module.
	    latin_only = r'^\p{Latin}+$'
	    return bool(regex.match(latin_only, text, flags=regex.UNICODE))

	def text_to_romanize(text):
	    """Return ``text`` as-is when it is Latin-only, else its transliteration.

	    Text accepted by ``is_roman_language`` is returned unchanged; anything
	    else is passed through ``unidecode`` and that result is returned.
	    """
	    return text if is_roman_language(text) else unidecode(text)

	def is_alpha(s: str, min_alpha=0.60) -> bool:
	    """Check whether enough of ``s`` consists of letters and space separators.

	    A character qualifies when its Unicode general category starts with
	    ``L`` (any letter class) or is exactly ``Zs`` (space separator).

	    Args:
	        s: The string to inspect.
	        min_alpha: Minimum fraction of qualifying characters required.

	    Returns:
	        ``False`` for an empty string; otherwise whether the qualifying
	        fraction of ``s`` is at least ``min_alpha``.
	    """
	    if not s:
	        return False
	    qualifying = 0
	    for ch in s:
	        category = unicodedata.category(ch)
	        if category.startswith("L") or category == "Zs":
	            qualifying += 1
	    return qualifying / len(s) >= min_alpha

	def remove_spaces_from_ends(input_string):
	    """Strip leading and trailing whitespace from ``input_string``.

	    Whitespace in the interior of the string is left untouched.

	    Args:
	        input_string: The text to trim.

	    Returns:
	        A copy of ``input_string`` with the whitespace runs at its start
	        and end removed.
	    """
	    # The alternation must be an unescaped "|". Written as "\|" it is a
	    # literal pipe, so the pattern only matched "<spaces>|<spaces>" spanning
	    # the whole string and never trimmed ordinary surrounding whitespace.
	    return re.sub(r'^\s+|\s+$', '', input_string)