CO3 / src /modules /lexical.py
amaisto's picture
Upload 4 files
4fde834 verified
import re
import math
from collections import Counter
class LexicalModule:
    """Compute lexical statistics for an English or Italian text.

    The instance holds the external lexical resources passed to the
    constructor plus hard-coded word lists (demonstratives, deictics,
    emphatics, first-person forms, articles) for each supported language.
    ``analyze`` turns one pre-processed document into a flat dict of metrics.
    """

    def __init__(self, r1, r2, r3, mrc):
        """Store the lexical resources and build the per-language word lists.

        Args:
            r1, r2, r3: Lemma collections. ``analyze`` tests membership in
                each one and combines them with ``|``, so set-like objects
                are expected.
                NOTE(review): presumably vocabulary bands - confirm.
            mrc: Mapping from lemma to a numeric score, read with ``.get``;
                it feeds the "concreteness" metric.
        """
        self.r1, self.r2, self.r3, self.mrc = r1, r2, r3, mrc

        # NOTE(review): ``analyze`` compares these lists against the text of
        # single tokens, so entries that contain a space (e.g. "last week",
        # "kind of") only match if the tokenizer emits them as one token -
        # confirm whether phrase matching is intended.

        # --- English sets ---
        self.demons_set_en = {"this", "that", "these", "those"}
        self.deictics_set_en = {
            "i", "me", "my", "mine", "we", "us", "our", "ours",
            "you", "your", "yours", "he", "him", "his", "she",
            "her", "hers", "it", "its", "they", "them", "their", "theirs",
            "here", "there", "this", "these", "that", "those",
            "yonder", "above", "below", "in front", "behind",
            "now", "then", "today", "yesterday", "tomorrow", "tonight",
            "last week", "last month", "last year", "next week",
            "next month", "next year", "ago", "soon", "later", "currently",
            "sir", "madam", "ma'am", "your honor", "professor", "doctor",
            "the former", "the latter", "above-mentioned", "as follows", "hereby",
            "this way", "that time", "these days", "those years",
            "thus", "therefore", "hereby", "henceforth"
        }
        self.emphatics_en = {
            "just", "even", "so", "really", "absolutely",
            "quite", "indeed", "definitely", "certainly",
            "rather", "pretty", "precisely", "actually",
            "totally", "completely", "utterly", "extremely",
            "basically", "simply", "literally", "honestly",
            "frankly", "truly", "surely", "clearly", "obviously",
            "perfectly", "exactly", "merely", "only", "still",
            "yet", "already", "else", "ever", "never", "always",
            "very", "much", "too", "enough", "well", "oh",
            "wow", "hey", "like", "such", "right", "now",
            "then", "further", "more", "less", "super",
            "incredibly", "remarkably", "genuinely",
            "positively", "unbelievably",
            "extraordinarily", "immensely", "vastly",
            "somewhat", "kind of", "sort of",
            "pretty much", "practically", "essentially",
            "virtually", "nearly", "almost", "about",
            "roughly", "approximately", "potentially",
            "arguably", "seemingly", "apparently",
            "theoretically", "conceptually", "fundamentally",
            "relatively", "comparatively", "distinctly",
            "considerably", "significantly", "substantially",
            "surprisingly", "unexpectedly", "oddly",
            "strangely", "curiously", "ironically",
            "paradoxically", "undoubtedly", "indisputably", "categorically"
        }
        self.first_pers_en = {"i", "me", "myself"}
        self.articles_en = {"a", "an", "the", "some"}
        self.definite_en = {"the"}

        # --- Italian sets ---
        self.demons_set_it = {
            "questo", "questa", "questi", "queste",
            "quello", "quella", "quelli", "quelle",
            "codesto", "codesta", "codesti", "codeste",
            "ciò", "tal", "tale", "tali"
        }
        self.deictics_set_it = {
            # Personal
            "io", "me", "mi", "mio", "mia", "miei", "mie",
            "noi", "ci", "nostro", "nostra", "nostri", "nostre",
            "tu", "te", "ti", "tuo", "tua", "tuoi", "tue",
            "voi", "vi", "vostro", "vostra", "vostri", "vostre",
            "lui", "lo", "gli", "suo", "sua", "suoi", "sue",
            "lei", "la", "loro", "li", "le",
            "essi", "esse", "sé", "si",
            # Spatial
            "qui", "qua", "là", "lì", "lassù", "laggiù",
            "davanti", "dietro", "sopra", "sotto", "accanto",
            "vicino", "lontano", "oltre", "presso",
            # Temporal
            "ora", "adesso", "oggi", "ieri", "domani", "stanotte",
            "settimana scorsa", "mese scorso", "anno scorso",
            "prossima settimana", "prossimo mese", "prossimo anno",
            "presto", "tardi", "attualmente", "nel frattempo",
            # Social
            "signore", "signora", "professore", "dottore", "eccellenza",
            # Discourse
            "il suddetto", "il sottoscritto", "quanto sopra", "come segue",
            # Adverbial demonstratives
            "così", "pertanto", "quindi", "allora", "dunque"
        }
        self.emphatics_it = {
            "molto", "tanto", "assai", "parecchio", "alquanto",
            "enormemente", "immensamente", "incredibilmente", "straordinariamente",
            "oltremodo", "sommamente", "estremamente", "tremendamente",
            "proprio", "davvero", "veramente", "realmente", "effettivamente",
            "concretamente", "precisamente", "esattamente", "letteralmente",
            "certamente", "sicuramente", "assolutamente", "decisamente",
            "indubbiamente", "senz'altro", "senza dubbio", "ovviamente",
            "naturalmente", "chiaramente", "evidentemente",
            "abbastanza", "piuttosto", "relativamente",
            "praticamente", "quasi", "circa", "pressappoco", "sostanzialmente",
            "fondamentalmente", "essenzialmente", "in sostanza", "in fondo",
            "tutto sommato", "grosso modo", "più o meno",
            "solo", "soltanto", "solamente", "appena", "persino", "perfino",
            "addirittura", "almeno", "quanto meno", "perlomeno",
            "già", "ancora", "mai", "sempre", "pure", "anche",
            "insomma", "dopotutto", "in effetti", "anzi", "ecco",
            "beh", "bene", "cioè"
        }
        self.first_pers_it = {"io", "me", "mi", "me stesso", "me stessa"}
        # Italian articles (definite + common indefinite forms)
        self.articles_it = {"il", "lo", "la", "i", "gli", "le", "un", "uno", "una", "un'"}
        self.definite_it = {"il", "lo", "la", "i", "gli", "le"}

    @staticmethod
    def _count_adjacent_repeats(words, size):
        """Count positions where a run of *size* words is immediately repeated."""
        last_start = len(words) - 2 * size
        return sum(
            1
            for i in range(last_start + 1)
            if words[i:i + size] == words[i + size:i + 2 * size]
        )

    def analyze(self, doc, lemmas, tokens_text, token_text_SP, raw_text, sentences, paragraphs, lang="en"):
        """Return a dict of lexical metrics for one document.

        Args:
            doc: Iterable of token objects exposing ``.pos_`` and ``.text``
                (presumably a spaCy ``Doc`` - confirm against the caller).
            lemmas: Sized collection of lemma strings.
            tokens_text: Sized collection of token strings; used for the
                frequency counts and as the denominator of several ratios.
            token_text_SP: Sized collection whose length is the denominator
                of ``hapax_ratio``.
                NOTE(review): exact relation to ``tokens_text`` is not
                visible here - confirm.
            raw_text: The document as a plain string.
            sentences: Sized collection of sentences; only its length is used.
            paragraphs: Number used as the divisor of ``indice_d_testo``.
            lang: ``"it"`` selects the Italian word lists; any other value
                selects the English ones.

        Returns:
            dict mapping metric names to numbers. Note that, as computed
            here, ``"TTR"`` holds ``len(set(lemmas)) / sqrt(len(lemmas))``
            and ``"gunning_fog"`` holds the first element returned by
            ``d_value``.
        """
        # Pick the word lists for the requested language.
        if lang == "it":
            demonstratives = self.demons_set_it
            deictics = self.deictics_set_it
            emphatics = self.emphatics_it
            first_person = self.first_pers_it
            articles = self.articles_it
            definite = self.definite_it
        else:
            demonstratives = self.demons_set_en
            deictics = self.deictics_set_en
            emphatics = self.emphatics_en
            first_person = self.first_pers_en
            articles = self.articles_en
            definite = self.definite_en

        # One pass over the document gathers every token-level count.
        noun_count = pron_count = 0
        article_count = demonstrative_count = definite_count = 0
        deictic_count = first_person_count = emphatic_count = 0
        for token in doc:
            pos = token.pos_
            if pos in ("NOUN", "PROPN"):
                noun_count += 1
            elif pos == "PRON":
                pron_count += 1
            word = token.text.lower()
            if word in articles:
                article_count += 1
            if word in demonstratives:
                demonstrative_count += 1
            if word in definite:
                definite_count += 1
            if word in deictics:
                deictic_count += 1
            if word in first_person:
                first_person_count += 1
            if word in emphatics:
                emphatic_count += 1

        # An empty sentence list counts as one sentence so ratios stay defined.
        num_sents = len(sentences) if sentences else 1

        # Hapax legomena: token forms that occur exactly once.
        counts = Counter(tokens_text)
        hapax_total = sum(1.0 for freq in counts.values() if freq == 1)
        hapax_ratio = round(hapax_total / len(token_text_SP), 4) if token_text_SP else 0

        # Immediate repetitions of 1-, 2- and 3-word runs. This works on the
        # raw text with punctuation stripped, so it is language-agnostic.
        words = re.sub(r'[^\w\s]', '', raw_text, flags=re.UNICODE).lower().split()
        repetitions = sum(self._count_adjacent_repeats(words, size) for size in (1, 2, 3))

        guiraud = len(set(lemmas)) / math.sqrt(len(lemmas)) if lemmas else 0
        dvalue = d_value(raw_text, num_sents)[0]
        dvalue_sballato = d_value_sballato(raw_text, num_sents)[0]

        # Average the scores of scored lemmas only: numerator and denominator
        # now range over the same (score > 0) items.
        mrc_scores = [
            score for score in (self.mrc.get(lemma, 0) for lemma in lemmas) if score > 0
        ]
        concreteness = sum(mrc_scores) / len(mrc_scores) if mrc_scores else 0

        # Build the union once instead of once per lemma.
        listed = self.r1 | self.r2 | self.r3
        listed_count = sum(1 for lemma in lemmas if lemma in listed)

        def band_share(band):
            """Share of tokens whose lemma belongs to *band*."""
            if not tokens_text:
                return round(0, 4)
            hits = sum(1 for lemma in lemmas if lemma in band)
            return round(hits / len(tokens_text), 4)

        deictic_or_article = article_count + deictic_count

        return {
            "r1": band_share(self.r1),
            "r2": band_share(self.r2),
            "r3": band_share(self.r3),
            "other": max(0, len(lemmas) - listed_count),
            "concreteness": concreteness,
            "hapax": hapax_total,
            "hapax_ratio": hapax_ratio,
            "pron_noun_ratio": round(pron_count / noun_count, 4) if noun_count > 0 else 0,
            "reps_total": round(repetitions / len(sentences), 4) if sentences else 0,
            "deictics": deictic_count,
            "articles": article_count,
            "pronouns": pron_count,
            "nouns": noun_count,
            "demonstratives": demonstrative_count,
            "definite_articles": definite_count,
            "demonstratives_ratio": round(demonstrative_count / num_sents, 4),
            "definite_articles_ratio": round(definite_count / num_sents, 4),
            "TTR": round(guiraud, 2),
            "gunning_fog": round(dvalue, 2),
            # A document with no paragraphs would otherwise divide by zero.
            "indice_d_testo": round(dvalue_sballato / paragraphs, 2) if paragraphs else 0,
            "HD-D": round(calculate_hdd(tokens_text), 2),
            "emphatic_particles": round(emphatic_count / len(tokens_text), 4) if tokens_text else 0,
            "first_person_ratio": round(first_person_count / pron_count, 4) if pron_count > 0 else 0,
            "deictic_Frequency": round(deictic_count / deictic_or_article, 4) if deictic_or_article > 0 else 0,
        }
# ---------------------------------------------------------------------------
# Funzioni module-level (invariate rispetto all'originale)
# ---------------------------------------------------------------------------
def d_value(text, num_sents):
    """Return a fog-style readability score for *text*.

    The score is ``0.4 * (asl + pw)`` where ``asl`` is the average number of
    words per sentence and ``pw`` is the percentage of words containing at
    least three vowel groups (runs of ``a e i o u y``).

    Args:
        text: Raw text to score.
        num_sents: Number of sentences in *text*; when it is not positive,
            ``asl`` is reported as 0.

    Returns:
        Tuple ``(score, asl, pw)``.
    """
    import unicodedata  # local import keeps this block self-contained

    # Fold accented letters onto their base letter ("verità" -> "verita").
    # The ASCII-only filter below would otherwise delete them, which drops
    # whole words such as "è" and loses syllables in others.
    decomposed = unicodedata.normalize("NFD", text)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    words = re.sub(r'[^a-zA-Z0-9\s]', '', folded).lower().split()
    num_words = len(words)
    asl = num_words / num_sents if num_sents > 0 else 0

    vowel_groups = re.compile(r'[aeiouy]+')
    polysyllable_words = sum(
        1 for word in words if len(vowel_groups.findall(word)) >= 3
    )
    pw = (polysyllable_words / num_words * 100) if num_words > 0 else 0
    return 0.4 * (asl + pw), asl, pw
def d_value_sballato(text, num_sents):
    """Return a variant of the fog-style score based on the total word count.

    The score is ``0.4 * (num_words + pw)``: the raw number of words stands
    in for an average sentence length, and ``pw`` is the percentage of words
    containing at least three vowel groups (runs of ``a e i o u y``).
    NOTE(review): "sballato" suggests this skew is deliberate - confirm
    before changing the formula.

    Args:
        text: Raw text to score.
        num_sents: Accepted for signature parity with callers; not used.

    Returns:
        Tuple ``(score, num_words, pw)``.
    """
    import unicodedata  # local import keeps this block self-contained

    # Fold accented letters onto their base letter ("verità" -> "verita").
    # The ASCII-only filter below would otherwise delete them, which drops
    # whole words such as "è" and loses syllables in others.
    decomposed = unicodedata.normalize("NFD", text)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    words = re.sub(r'[^a-zA-Z0-9\s]', '', folded).lower().split()
    num_words = len(words)

    vowel_groups = re.compile(r'[aeiouy]+')
    polysyllable_words = sum(
        1 for word in words if len(vowel_groups.findall(word)) >= 3
    )
    pw = (polysyllable_words / num_words * 100) if num_words > 0 else 0
    return 0.4 * (num_words + pw), num_words, pw
def calculate_hdd(tokens, n=42):
    """Return the expected number of distinct types in a random n-token sample.

    For every distinct token the hypergeometric probability of drawing it at
    least once in a sample of *n* tokens (without replacement) is computed,
    and those probabilities are added up.

    Args:
        tokens: Sized collection of hashable tokens.
        n: Sample size.

    Returns:
        The summed probabilities as a float. When there are fewer than *n*
        tokens, the number of distinct tokens is returned instead.
    """
    total = len(tokens)
    if total < n:
        # Too short to draw a full sample: fall back to the raw type count.
        return float(len(set(tokens)))

    all_samples = math.comb(total, n)
    expected_types = 0.0
    for freq in Counter(tokens).values():
        # math.comb gives 0 when fewer than n other tokens remain, so a type
        # that no sample can avoid contributes exactly 1.0.
        p_absent = math.comb(total - freq, n) / all_samples
        expected_types += 1.0 - p_absent
    return expected_types