shahidul034
/

readctrl

Model card Files Files and versions

readctrl / code /old /FH_es.py

shahidul034's picture

Add files using upload-large-folder tool

1db7196 verified about 1 month ago

3.14 kB

import re
import unicodedata

	# --- Spanish tokenization ---
	WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)

	def _tokenize_words_es(text: str):
	return WORD_RE.findall(text)

	def _count_sentences_es(text: str) -> int:
	# Count sentences via ., !, ?, … and Spanish ¡¿
	sentences = re.split(r"[.!?…]+\|[¡¿]", text)
	return max(1, sum(1 for s in sentences if s.strip()))

	# --- Syllable counting ---
	try:
	import pyphen
	_dic = pyphen.Pyphen(lang='es') # or 'es_ES'

	def count_syllables_es(word: str) -> int:
	# Use hyphenation positions; count pieces
	hyph = _dic.inserted(word)
	return max(1, hyph.count('-') + 1)
	except Exception:
	# Heuristic fallback (handles hiatus and silent 'u' roughly)
	def count_syllables_es(word: str) -> int:
	w = word.lower()

	# Treat final 'y' as vowel 'i'
	w = re.sub(r'y$', 'i', w)

	# Remove silent 'u' before e/i in 'que/qui/gue/gui' (but not 'güe/güi')
	w = re.sub(r'que', 'qe', w)
	w = re.sub(r'qui', 'qi', w)
	w = re.sub(r'gue', 'ge', w)
	w = re.sub(r'gui', 'gi', w)

	vowels = set("aeiouáéíóúü")
	strong = set("aáeéoóíú") # accented í/ú behave like strong (hiatus)
	n = len(w)
	i = 0
	syll = 0
	while i < n:
	if w[i] not in vowels:
	i += 1
	continue
	# collect contiguous vowels
	j = i + 1
	while j < n and w[j] in vowels:
	j += 1
	seq = w[i:j]
	# one nucleus by default
	nuclei = 1
	# split on strong-strong boundaries (ae, ea, ao, oa, eo, oe, and cases with í/ú)
	for k in range(len(seq) - 1):
	if seq[k] in strong and seq[k + 1] in strong:
	nuclei += 1
	syll += nuclei
	i = j
	return max(1, syll)

	# --- Fernández–Huerta (FH) ---
	def fernandez_huerta(text: str) -> float \| None:
	"""
	Fernández–Huerta readability for Spanish.
	Higher = easier. Typical range ~0–100.
	"""
	words = _tokenize_words_es(text)
	n_words = len(words)
	if n_words == 0:
	return None
	n_sentences = _count_sentences_es(text)
	n_syllables = sum(count_syllables_es(w) for w in words)

	# FH = 206.84 - 0.60 * (P) - 1.02 * (F)
	# P = (syllables/words)*100, F = words/sentence
	fh = 206.84 - 0.60 * ((n_syllables / n_words) * 100.0) - 1.02 * (n_words / n_sentences)
	return round(fh, 2)

	# --- Quick check ---
	# if __name__ == "__main__":
	# text_easy = "El corazón es un órgano que bombea sangre. En este caso, funciona bien."
	# text_medium = "El corazón del paciente muestra una función adecuada, aunque se observaron pequeños cambios que deben revisarse."
	# text_hard = "La evaluación cardiológica indicó una función sistólica preservada, con alteraciones discretas en la relajación diastólica."
	# print("Easy FH:", fernandez_huerta(text_easy))
	# print("Medium FH:", fernandez_huerta(text_medium))
	# print("Hard FH:", fernandez_huerta(text_hard))