readctrl / code /old /FH_es.py
shahidul034's picture
Add files using upload-large-folder tool
1db7196 verified
import re
import unicodedata
# --- Spanish tokenization ---
WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)
def _tokenize_words_es(text: str):
return WORD_RE.findall(text)
def _count_sentences_es(text: str) -> int:
# Count sentences via ., !, ?, … and Spanish ¡¿
sentences = re.split(r"[.!?…]+|[¡¿]", text)
return max(1, sum(1 for s in sentences if s.strip()))
# --- Syllable counting ---
try:
    import pyphen

    _dic = pyphen.Pyphen(lang='es')  # or 'es_ES'

    def count_syllables_es(word: str) -> int:
        """Count the syllables of ``word`` using pyphen's Spanish hyphenation.

        Every piece between the hyphens pyphen inserts is taken as one
        syllable. The result is never below 1.
        """
        pieces = _dic.inserted(word).split('-')
        return max(1, len(pieces))
except Exception:
    # pyphen could not be imported or set up: fall back to a rough
    # rule-based count (handles hiatus and silent 'u' approximately).
    def count_syllables_es(word: str) -> int:
        """Approximate the syllable count of ``word`` with vowel-group rules.

        Each run of consecutive vowels contributes one syllable, plus one
        more for every adjacent pair of "strong" vowels inside the run
        (a/e/o, accented or not, and accented í/ú), which is treated as a
        hiatus. The result is never below 1.
        """
        # A word-final 'y' acts as the vowel 'i'.
        text = re.sub(r'y$', 'i', word.lower())
        # Drop the silent 'u' of que/qui/gue/gui; 'güe'/'güi' are untouched
        # because 'ü' is a different character.
        for cluster, reduced in (('que', 'qe'), ('qui', 'qi'),
                                 ('gue', 'ge'), ('gui', 'gi')):
            text = text.replace(cluster, reduced)

        strong = frozenset("aáeéoóíú")
        total = 0
        # Walk every maximal run of vowels in the word.
        for run in re.findall(r'[aeiouáéíóúü]+', text):
            hiatus_breaks = sum(
                1 for left, right in zip(run, run[1:])
                if left in strong and right in strong
            )
            total += 1 + hiatus_breaks
        return max(1, total)
# --- Fernández–Huerta (FH) ---
def fernandez_huerta(text: str) -> float | None:
    """
    Compute the Fernández–Huerta readability score of Spanish ``text``.

    Higher scores mean easier text; typical values fall roughly in 0–100.

    The score is ``206.84 - 0.60 * P - 1.02 * F`` where ``P`` is the number of
    syllables per 100 words and ``F`` is the average number of words per
    sentence.

    Returns:
        The score rounded to two decimals, or ``None`` when ``text`` contains
        no words.
    """
    tokens = _tokenize_words_es(text)
    if not tokens:
        # Nothing to measure; also avoids dividing by zero below.
        return None

    word_total = len(tokens)
    sentence_total = _count_sentences_es(text)
    syllable_total = sum(map(count_syllables_es, tokens))

    syllables_per_100_words = (syllable_total / word_total) * 100.0
    words_per_sentence = word_total / sentence_total

    score = 206.84 - 0.60 * syllables_per_100_words - 1.02 * words_per_sentence
    return round(score, 2)
# --- Quick check ---
# if __name__ == "__main__":
# text_easy = "El corazón es un órgano que bombea sangre. En este caso, funciona bien."
# text_medium = "El corazón del paciente muestra una función adecuada, aunque se observaron pequeños cambios que deben revisarse."
# text_hard = "La evaluación cardiológica indicó una función sistólica preservada, con alteraciones discretas en la relajación diastólica."
# print("Easy FH:", fernandez_huerta(text_easy))
# print("Medium FH:", fernandez_huerta(text_medium))
# print("Hard FH:", fernandez_huerta(text_hard))