File size: 2,659 Bytes
1db7196 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | import re
# pyphen is an optional dependency: without it, syllable counting falls back
# to a vowel-group heuristic.
try:
    import pyphen
except ImportError:
    pyphen = None


def _load_hyphenator(lang):
    """Return a pyphen hyphenator for *lang*, or None if it cannot be built.

    Each dictionary is loaded on its own so that a failure for one language
    does not throw away a hyphenator that loaded fine for the other.
    """
    if pyphen is None:
        return None
    try:
        return pyphen.Pyphen(lang=lang)
    except Exception:
        # Best effort: any problem with the dictionary just means
        # "no hyphenator for this language".
        return None


_hyph_pt_br = _load_hyphenator('pt_BR')
_hyph_pt_pt = _load_hyphenator('pt_PT')
# --- Tokenization ---
# Runs of ASCII letters plus the Latin-1 accented letters (áâãà ç éê í óôõ ú ü
# etc.); the two gaps in the ranges skip the × (U+00D7) and ÷ (U+00F7) signs.
WORD_RE_PT = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]+", re.UNICODE)


def tokenize_words_pt(text: str) -> list:
    """Return the alphabetic word tokens of *text*, in order of appearance.

    The text is NFC-normalized first: WORD_RE_PT only knows precomposed
    accented letters, so an accent stored as a separate combining mark
    would otherwise cut a word in two ("na" + "o" instead of "não").

    Anything that is not a letter matched by WORD_RE_PT (digits, spaces,
    punctuation, hyphens) acts as a separator, so "Recomenda-se" yields
    two tokens.
    """
    import unicodedata

    return WORD_RE_PT.findall(unicodedata.normalize("NFC", text))
# A sentence boundary is a run of '!', '?', '…' or '.' characters. A period
# with a digit on BOTH sides ("1.500", "14.30") is part of a number, not the
# end of a sentence, so it is excluded from the run.
_SENTENCE_BOUNDARY_RE = re.compile(r"(?:[!?…]|(?<!\d)\.|\.(?!\d))+")


def count_sentences_pt(text: str) -> int:
    """Return the number of sentences in *text* (never less than 1).

    Kept deliberately simple: the text is split on runs of '.', '!', '?'
    and '…', and every non-blank fragment counts as one sentence. Periods
    sitting between two digits are not treated as boundaries, so numbers
    such as "1.500" do not inflate the count.

    Text with no non-blank fragment (including the empty string) counts as
    a single sentence, which keeps callers safe from division by zero.
    """
    fragments = _SENTENCE_BOUNDARY_RE.split(text)
    return max(1, sum(1 for fragment in fragments if fragment.strip()))
def count_syllables_pt(word: str) -> int:
    """Estimate the number of syllables in *word* (always at least 1).

    When a hyphenation dictionary is available (pt_BR is preferred, pt_PT is
    the fallback), the count is the number of '-' characters in the
    hyphenated form returned by ``inserted`` plus one. Otherwise a rough
    heuristic is used: each run of consecutive vowels counts as one syllable.
    """
    hyphenator = _hyph_pt_br or _hyph_pt_pt
    if hyphenator:
        breaks = hyphenator.inserted(word).count('-')
        return max(1, breaks + 1)

    # No dictionary: approximate with maximal vowel runs.
    vowel_runs = re.findall(r"[aeiouyAEIOUYàáâãéêíóôõúüÀÁÂÃÉÊÍÓÔÕÚÜ]+", word)
    return max(1, len(vowel_runs))
# --- Flesch Reading Ease (Portuguese adaptation) ---
def flesch_portuguese(text: str):
    """Return the Portuguese-adapted Flesch Reading Ease score of *text*.

    The score is ``248.835 - 1.015 * (words / sentences)
    - 84.6 * (syllables / words)``, rounded to two decimals.
    Returns None when *text* contains no words.
    """
    tokens = tokenize_words_pt(text)
    if not tokens:
        return None

    word_total = len(tokens)
    sentence_total = count_sentences_pt(text)
    syllable_total = sum(map(count_syllables_pt, tokens))

    words_per_sentence = word_total / sentence_total
    syllables_per_word = syllable_total / word_total
    return round(
        248.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word,
        2,
    )
# --- LIX / RIX ---
def lix(text: str):
    """Return the LIX index of *text*, rounded to two decimals.

    LIX is the average number of words per sentence plus the percentage of
    long words, where a long word has more than 6 letters.
    Returns None when *text* contains no words.
    """
    tokens = tokenize_words_pt(text)
    if not tokens:
        return None

    total = len(tokens)
    sentences = count_sentences_pt(text)
    long_count = len([token for token in tokens if len(token) > 6])
    return round((total / sentences) + (100.0 * long_count / total), 2)
def rix(text: str):
    """Return the RIX index of *text*, rounded to two decimals.

    RIX is the number of long words (more than 6 letters) divided by the
    number of sentences. Returns None when *text* contains no words.
    """
    tokens = tokenize_words_pt(text)
    if not tokens:
        return None

    sentences = count_sentences_pt(text)
    long_count = len([token for token in tokens if len(token) > 6])
    return round(long_count / sentences, 2)
# --- Band checks ---
# Each table maps a band label to its (lower, upper) score bounds.
# Neighbouring bands share their boundary value (e.g. 70 closes B2 and opens B1
# in the Flesch table).

# Bounds for the Portuguese Flesch Reading Ease score.
FRE_PT_BANDS = dict(B1=(70, 100), B2=(60, 70), B3=(45, 60))

# Bounds for the LIX index.
LIX_BANDS = dict(B1=(20, 35), B2=(35, 45), B3=(45, 60))
def in_band(score, band, bands, delta=0.0):
    """Tell whether *score* lies inside the named band, with some tolerance.

    Args:
        score: The value to check; None is never inside any band.
        band: Key of the band to look up in *bands*.
        bands: Mapping from band key to a (lower, upper) pair.
        delta: Slack subtracted from the lower bound and added to the upper.

    Returns:
        True when ``lower - delta <= score <= upper + delta`` (both ends
        inclusive), otherwise False.

    Raises:
        KeyError: If *band* is not in *bands* (only when score is not None).
    """
    if score is None:
        return False

    lower, upper = bands[band]
    floor = lower - delta
    ceiling = upper + delta
    return floor <= score and score <= ceiling
# Example
if __name__ == "__main__":
    # Demo: score a short three-sentence sample and report whether the
    # Flesch and LIX results land in band B1 (with a small tolerance).
    sample = "O paciente está bem. Os exames não mostram sinais de infecção. Recomenda-se apenas acompanhamento."

    fre_score = flesch_portuguese(sample)
    lix_score = lix(sample)
    rix_score = rix(sample)

    fre_is_b1 = in_band(fre_score, 'B1', FRE_PT_BANDS, delta=1.0)
    lix_is_b1 = in_band(lix_score, 'B1', LIX_BANDS, delta=2.0)

    print("FRE-PT:", fre_score, "B1?", fre_is_b1)
    print("LIX:", lix_score, "B1?", lix_is_b1)
    print("RIX:", rix_score)