File size: 1,920 Bytes
1db7196 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | import re
import pyphen
# --- Basic Spanish text stats ---
# Pyphen hyphenation dictionary loaded for Spanish (es_ES); the syllable
# counter below derives its estimate from this dictionary.
_dic = pyphen.Pyphen(lang='es_ES')
# A "word" is a maximal run of ASCII letters plus the Spanish accented
# vowels, Ü/ü and Ñ/ñ. Digits and punctuation never belong to a word.
_word_re = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)
def _tokenize_words(text):
    """Return every word found in *text*, in order of appearance.

    A word is whatever the module-level ``_word_re`` pattern matches;
    everything between matches is discarded.
    """
    return [match.group(0) for match in _word_re.finditer(text)]
def _count_sentences(text):
# Split on ., !, ?, and Spanish ¡¿ — keep it simple
parts = re.split(r"[.!?¡¿]+", text)
return max(1, sum(1 for p in parts if p.strip()))
def _count_syllables_es(word):
    """Estimate the number of syllables in a Spanish *word*.

    The estimate is the number of hyphenation points reported by the
    module-level Pyphen dictionary ``_dic`` plus one, so a word with no
    hyphenation point counts as a single syllable.

    Args:
        word: A single word (no spaces or punctuation).

    Returns:
        The estimated syllable count, always at least 1.
    """
    # Pyphen exposes hyphenation points through ``positions()``; it has no
    # ``hyphenate()`` method, so the previous call raised AttributeError.
    # NOTE(review): Pyphen's default left/right margins presumably suppress
    # breaks close to the word edges, which would undercount short words —
    # confirm whether ``_dic`` should be built with left=1, right=1.
    break_points = _dic.positions(word)
    return len(break_points) + 1
def _text_stats_es(text):
    """Collect the raw counts needed by the readability formulas.

    Returns:
        A 4-tuple ``(words, sentences, syllables, long_words)`` where
        ``long_words`` is the number of words longer than 6 characters
        (the LIX definition of a long word).
    """
    tokens = _tokenize_words(text)
    sentence_total = _count_sentences(text)

    syllable_total = 0
    long_total = 0
    for token in tokens:
        syllable_total += _count_syllables_es(token)
        if len(token) > 6:
            long_total += 1

    return len(tokens), sentence_total, syllable_total, long_total
# --- Szigriszt–Pazos (INFLESZ) ---
def szigriszt_pazos(text):
    """Compute the Szigriszt-Pazos reading-ease score for *text*.

    The score is ``206.835 - 62.3 * syllables/words - words/sentences``;
    a higher value means easier text.

    Returns:
        The score as a float, or ``None`` when the text has no words
        (or no sentences) and the ratios are undefined.
    """
    word_total, sentence_total, syllable_total, _long = _text_stats_es(text)
    if word_total != 0 and sentence_total != 0:
        syllables_per_word = syllable_total / word_total
        words_per_sentence = word_total / sentence_total
        return 206.835 - 62.3 * syllables_per_word - words_per_sentence
    return None
# --- LIX (language-agnostic) ---
def lix(text):
    """Compute the LIX readability index for *text*.

    LIX is the mean sentence length in words plus the percentage of long
    words, using the counts reported by ``_text_stats_es``.

    Returns:
        The index as a float, or ``None`` when the text has no words
        (or no sentences) and the ratios are undefined.
    """
    word_total, sentence_total, _syllables, long_total = _text_stats_es(text)
    if word_total != 0 and sentence_total != 0:
        mean_sentence_length = word_total / sentence_total
        long_word_percent = 100.0 * long_total / word_total
        return mean_sentence_length + long_word_percent
    return None
# Example bands (tune to your corpus)
# Szigriszt-Pazos score ranges per band, as (low, high) pairs.
SZ_BANDS = {
    # easy to very easy
    "B1": (65, 100),
    # normal
    "B2": (55, 65),
    # somewhat hard
    "B3": (40, 55),
}
# LIX index ranges per band, as (low, high) pairs.
LIX_BANDS = {
    # easier
    "B1": (20, 35),
    # mid
    "B2": (35, 45),
    # harder
    "B3": (45, 60),
}
def in_band(score, band, bands, delta=0.0):
    """Tell whether *score* lies inside the named band, with tolerance.

    Args:
        score: The readability score, or ``None`` if it could not be
            computed.
        band: Key of the band to test against in *bands*.
        bands: Mapping of band key to an inclusive ``(low, high)`` pair.
        delta: Slack added on both ends of the range.

    Returns:
        ``False`` when *score* is ``None``; otherwise whether
        ``low - delta <= score <= high + delta``.

    Raises:
        KeyError: If *band* is not in *bands* (only when *score* is not
            ``None``, since the lookup is skipped otherwise).
    """
    if score is not None:
        lower, upper = bands[band]
        widened_lower = lower - delta
        widened_upper = upper + delta
        return widened_lower <= score <= widened_upper
    return False
# Example usage
text = "Las vacunas salvan millones de vidas cada año. Son seguras y eficaces."
# Score the sample sentence pair with both indices.
sz, lx = szigriszt_pazos(text), lix(text)
# Uncomment to print each score and whether it falls in band B1 (±2):
# print("Szigriszt:", sz, "B1?", in_band(sz, 'B1', SZ_BANDS, delta=2))
# print("LIX:", lx, "B1?", in_band(lx, 'B1', LIX_BANDS, delta=2))