mshahidul

Initial commit of readCtrl code without large models

030876e 7 days ago

1.92 kB

	import re
	import pyphen

	# --- Basic Spanish text stats ---
	# Shared pyphen hyphenation dictionary for Spanish (es_ES); built once at
	# import time and reused by the syllable counter.
	_dic = pyphen.Pyphen(lang='es_ES')

	# A word is a run of ASCII letters and/or the accented Spanish letters listed
	# here; digits, punctuation and whitespace never belong to a word.
	_word_re = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)

	def _tokenize_words(text):
	    """Return every word found in *text*, in order of appearance.

	    A word is one full match of the module-level ``_word_re`` pattern;
	    anything the pattern does not match acts as a separator.
	    """
	    return [match.group(0) for match in _word_re.finditer(text)]

	def _count_sentences(text):
	# Split on ., !, ?, and Spanish ¡¿ — keep it simple
	parts = re.split(r"[.!?¡¿]+", text)
	return max(1, sum(1 for p in parts if p.strip()))

	def _count_syllables_es(word):
	    """Estimate the syllable count of *word* from its hyphenation points.

	    Uses ``Pyphen.positions``, which lists the indices where the word may
	    be broken; ``Pyphen`` objects provide no ``hyphenate`` method.  A word
	    with ``n`` break points has ``n + 1`` segments, and a word with no
	    break points counts as a single syllable.
	    """
	    break_points = _dic.positions(word)
	    return len(break_points) + 1

	def _text_stats_es(text):
	    """Collect the raw counts used by the readability formulas.

	    Returns a 4-tuple ``(W, S, syl, LW)``: number of words, number of
	    sentences, total syllables over all words, and number of long words
	    (more than 6 characters, the LIX definition).
	    """
	    tokens = _tokenize_words(text)
	    sentence_count = _count_sentences(text)
	    syllable_total = 0
	    long_word_count = 0
	    for token in tokens:
	        syllable_total += _count_syllables_es(token)
	        if len(token) > 6:
	            long_word_count += 1
	    return len(tokens), sentence_count, syllable_total, long_word_count

	# --- Szigriszt–Pazos (INFLESZ) ---
	def szigriszt_pazos(text):
	    """Return the Szigriszt-Pazos reading-ease score of *text*.

	    The score is ``206.835 - 62.3 * (syllables / words) - (words / sentences)``;
	    a higher value means easier text.  Returns ``None`` when the text has
	    no words (or no sentences), since the ratios would be undefined.
	    """
	    words, sentences, syllables, _long_words = _text_stats_es(text)
	    if not words or not sentences:
	        return None
	    syllables_per_word = syllables / words
	    words_per_sentence = words / sentences
	    return 206.835 - 62.3 * syllables_per_word - words_per_sentence

	# --- LIX (language-agnostic) ---
	def lix(text):
	    """Return the LIX score of *text*.

	    LIX is the average sentence length in words plus the percentage of
	    long words, as counted by ``_text_stats_es``.  Returns ``None`` when
	    the text has no words (or no sentences).
	    """
	    words, sentences, _syllables, long_words = _text_stats_es(text)
	    if words and sentences:
	        return words / sentences + 100.0 * long_words / words
	    return None

	# Example bands (tune to your corpus)
	# Szigriszt-Pazos score range (low, high) per band:
	#   B1 = easy to very easy, B2 = normal, B3 = somewhat hard.
	SZ_BANDS = {"B1": (65, 100), "B2": (55, 65), "B3": (40, 55)}

	# LIX score range (low, high) per band:
	#   B1 = easier, B2 = mid, B3 = harder.
	LIX_BANDS = {"B1": (20, 35), "B2": (35, 45), "B3": (45, 60)}

	def in_band(score, band, bands, delta=0.0):
	    """Tell whether *score* lies inside the named band, widened by *delta*.

	    *bands* maps a band name to a ``(low, high)`` pair; both ends are
	    inclusive and each is pushed outward by *delta*.  A ``None`` score is
	    never in any band and is rejected before *bands* is consulted.
	    """
	    if score is not None:
	        lower, upper = bands[band]
	        return lower - delta <= score <= upper + delta
	    return False

	# Example usage: score a short two-sentence Spanish sample with both metrics.
	# These statements run at import time; the prints are left disabled.
	text = "Las vacunas salvan millones de vidas cada año. Son seguras y eficaces."
	sz, lx = szigriszt_pazos(text), lix(text)
	# print("Szigriszt:", sz, "B1?", in_band(sz, 'B1', SZ_BANDS, delta=2))
	# print("LIX:", lx, "B1?", in_band(lx, 'B1', LIX_BANDS, delta=2))