File size: 1,920 Bytes

1db7196

import re
import pyphen

# --- Basic Spanish text stats ---
# Pyphen hyphenation dictionary for Spanish (es_ES). It is created once at
# import time and used by the syllable counter below.
_dic = pyphen.Pyphen(lang='es_ES')

# A "word" is a run of ASCII letters plus the Spanish accented vowels, ü and ñ.
# Digits, punctuation and whitespace never match, so they act as separators.
_word_re = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)

def _tokenize_words(text):
    """Return every word found in *text*, in order of appearance.

    A word is whatever the module-level ``_word_re`` pattern matches; the
    result is a list of strings (empty when *text* contains no letters).
    """
    return [match.group(0) for match in _word_re.finditer(text)]

def _count_sentences(text):
    # Split on ., !, ?, and Spanish ¡¿ — keep it simple
    parts = re.split(r"[.!?¡¿]+", text)
    return max(1, sum(1 for p in parts if p.strip()))

def _count_syllables_es(word):
    """Estimate the number of syllables in a Spanish word.

    The estimate is based on the hyphenation break points that pyphen reports
    for *word*: ``n`` break points are counted as ``n + 1`` syllables, and a
    word without any break point counts as a single syllable.

    Args:
        word: A single word, as produced by ``_tokenize_words``.

    Returns:
        The estimated syllable count (always at least 1).
    """
    # Pyphen has no ``hyphenate()`` method; ``positions()`` is the call that
    # returns the list of indices at which the word may be split.
    break_points = _dic.positions(word)
    # An empty list yields 1, matching the "no break point" case.
    return len(break_points) + 1

def _text_stats_es(text):
    """Gather the raw counts needed by the readability formulas.

    Returns:
        A 4-tuple ``(words, sentences, syllables, long_words)`` where
        *long_words* is the number of words longer than 6 characters
        (the LIX definition of a long word).
    """
    tokens = _tokenize_words(text)
    sentence_count = _count_sentences(text)

    syllable_total = 0
    long_word_count = 0
    for token in tokens:
        syllable_total += _count_syllables_es(token)
        if len(token) > 6:  # LIX long words (>6 chars)
            long_word_count += 1

    return len(tokens), sentence_count, syllable_total, long_word_count

# --- Szigriszt–Pazos (INFLESZ) ---
def szigriszt_pazos(text):
    """Compute the Szigriszt–Pazos reading-ease score of *text*.

    The score is ``206.835 - 62.3 * syllables/words - words/sentences``;
    higher values mean easier text.

    Returns:
        The score as a float, or ``None`` when the text has no words
        (or no sentences).
    """
    n_words, n_sentences, n_syllables, _unused = _text_stats_es(text)
    if not (n_words and n_sentences):
        return None
    syllables_per_word = n_syllables / n_words
    words_per_sentence = n_words / n_sentences
    return 206.835 - 62.3 * syllables_per_word - words_per_sentence

# --- LIX (language-agnostic) ---
def lix(text):
    """Compute the LIX readability index of *text*.

    LIX is the average sentence length in words plus the percentage of long
    words (more than 6 characters).

    Returns:
        The index as a float, or ``None`` when the text has no words
        (or no sentences).
    """
    n_words, n_sentences, _unused, n_long = _text_stats_es(text)
    if not (n_words and n_sentences):
        return None
    words_per_sentence = n_words / n_sentences
    long_word_pct = 100.0 * n_long / n_words
    return words_per_sentence + long_word_pct

# Example bands (tune to your corpus)
# Szigriszt–Pazos score ranges per band, stored as (low, high) tuples.
# Higher scores mean easier text, so B1 is the easiest band here.
# Adjacent bands share their boundary value (e.g. 65 is listed in both B1
# and B2), and scores below 40 are not covered by any band.
SZ_BANDS = {
    'B1': (65, 100),  # easy to very easy
    'B2': (55, 65),   # normal
    'B3': (40, 55),   # somewhat hard
}

# LIX index ranges per band, stored as (low, high) tuples. Unlike the
# Szigriszt–Pazos bands, lower values are the easier ones. Adjacent bands
# share their boundary value (35 and 45 each appear in two bands).
LIX_BANDS = {
    'B1': (20, 35),   # easier
    'B2': (35, 45),   # mid
    'B3': (45, 60),   # harder
}

def in_band(score, band, bands, delta=0.0):
    """Tell whether *score* falls inside the named band.

    Args:
        score: The readability score, or ``None`` when it could not be
            computed.
        band: Key of the band to check in *bands*.
        bands: Mapping from band name to an inclusive ``(low, high)`` range.
        delta: Tolerance by which the range is widened on both ends.

    Returns:
        ``True`` when ``low - delta <= score <= high + delta``; ``False``
        otherwise, including when *score* is ``None``.

    Raises:
        KeyError: If *band* is not present in *bands* (only when *score* is
            not ``None``).
    """
    if score is not None:
        lower, upper = bands[band]
        return lower - delta <= score <= upper + delta
    return False

# Example usage
# NOTE: these statements are at module level, so they run on import: the
# sample text is scored and `text`, `sz` and `lx` become module globals.
text = "Las vacunas salvan millones de vidas cada año. Son seguras y eficaces."
sz = szigriszt_pazos(text)
lx = lix(text)
# Uncomment to print each score and whether it lands in band B1 (tolerance 2):
# print("Szigriszt:", sz, "B1?", in_band(sz, 'B1', SZ_BANDS, delta=2))
# print("LIX:", lx, "B1?", in_band(lx, 'B1', LIX_BANDS, delta=2))