import re
import math
from collections import Counter


class LexicalModule:
    """Lexical feature extractor with built-in English and Italian word lists.

    The constructor stores four caller-supplied resources:

    * ``r1``, ``r2``, ``r3`` -- containers of lemmas, only ever queried with
      ``in``.
    * ``mrc`` -- mapping from lemma to a numeric score, read with
      ``mrc.get(lemma, 0)``.  NOTE(review): presumably MRC concreteness
      ratings, given the ``"concreteness"`` output key -- confirm.

    The word lists are kept as public instance attributes (``*_en`` / ``*_it``)
    so external code can inspect or extend them.

    NOTE(review): ``analyze`` compares the lexicons against the lower-cased
    text of one token at a time, so multi-word entries ("last week",
    "kind of", "nel frattempo", ...) only match if the tokenizer yields them
    as a single token.
    """

    def __init__(self, r1, r2, r3, mrc):
        self.r1, self.r2, self.r3, self.mrc = r1, r2, r3, mrc

        # --- English sets ---
        self.demons_set_en = {"this", "that", "these", "those"}
        self.deictics_set_en = {
            "i", "me", "my", "mine", "we", "us", "our", "ours",
            "you", "your", "yours", "he", "him", "his", "she", "her", "hers",
            "it", "its", "they", "them", "their", "theirs",
            "here", "there", "this", "these", "that", "those", "yonder",
            "above", "below", "in front", "behind",
            "now", "then", "today", "yesterday", "tomorrow", "tonight",
            "last week", "last month", "last year",
            "next week", "next month", "next year",
            "ago", "soon", "later", "currently",
            "sir", "madam", "ma'am", "your honor", "professor", "doctor",
            "the former", "the latter", "above-mentioned", "as follows",
            "hereby", "this way", "that time", "these days", "those years",
            "thus", "therefore", "henceforth",
        }
        self.emphatics_en = {
            "just", "even", "so", "really", "absolutely", "quite", "indeed",
            "definitely", "certainly", "rather", "pretty", "precisely",
            "actually", "totally", "completely", "utterly", "extremely",
            "basically", "simply", "literally", "honestly", "frankly",
            "truly", "surely", "clearly", "obviously", "perfectly", "exactly",
            "merely", "only", "still", "yet", "already", "else", "ever",
            "never", "always", "very", "much", "too", "enough", "well",
            "oh", "wow", "hey", "like", "such", "right", "now", "then",
            "further", "more", "less", "super", "incredibly", "remarkably",
            "genuinely", "positively", "unbelievably", "extraordinarily",
            "immensely", "vastly", "somewhat", "kind of", "sort of",
            "pretty much", "practically", "essentially", "virtually",
            "nearly", "almost", "about", "roughly", "approximately",
            "potentially", "arguably", "seemingly", "apparently",
            "theoretically", "conceptually", "fundamentally", "relatively",
            "comparatively", "distinctly", "considerably", "significantly",
            "substantially", "surprisingly", "unexpectedly", "oddly",
            "strangely", "curiously", "ironically", "paradoxically",
            "undoubtedly", "indisputably", "categorically",
        }
        self.first_pers_en = {"i", "me", "myself"}
        self.articles_en = {"a", "an", "the", "some"}
        self.definite_en = {"the"}

        # --- Italian sets ---
        self.demons_set_it = {
            "questo", "questa", "questi", "queste",
            "quello", "quella", "quelli", "quelle",
            "codesto", "codesta", "codesti", "codeste",
            "ciò", "tal", "tale", "tali",
        }
        self.deictics_set_it = {
            # Personal
            "io", "me", "mi", "mio", "mia", "miei", "mie",
            "noi", "ci", "nostro", "nostra", "nostri", "nostre",
            "tu", "te", "ti", "tuo", "tua", "tuoi", "tue",
            "voi", "vi", "vostro", "vostra", "vostri", "vostre",
            "lui", "lo", "gli", "suo", "sua", "suoi", "sue",
            "lei", "la", "loro", "li", "le", "essi", "esse", "sé", "si",
            # Spatial
            "qui", "qua", "là", "lì", "lassù", "laggiù",
            "davanti", "dietro", "sopra", "sotto", "accanto",
            "vicino", "lontano", "oltre", "presso",
            # Temporal
            "ora", "adesso", "oggi", "ieri", "domani", "stanotte",
            "settimana scorsa", "mese scorso", "anno scorso",
            "prossima settimana", "prossimo mese", "prossimo anno",
            "presto", "tardi", "attualmente", "nel frattempo",
            # Social
            "signore", "signora", "professore", "dottore", "eccellenza",
            # Discourse
            "il suddetto", "il sottoscritto", "quanto sopra", "come segue",
            # Adverbial demonstratives
            "così", "pertanto", "quindi", "allora", "dunque",
        }
        self.emphatics_it = {
            "molto", "tanto", "assai", "parecchio", "alquanto",
            "enormemente", "immensamente", "incredibilmente",
            "straordinariamente", "oltremodo", "sommamente", "estremamente",
            "tremendamente", "proprio", "davvero", "veramente", "realmente",
            "effettivamente", "concretamente", "precisamente", "esattamente",
            "letteralmente", "certamente", "sicuramente", "assolutamente",
            "decisamente", "indubbiamente", "senz'altro", "senza dubbio",
            "ovviamente", "naturalmente", "chiaramente", "evidentemente",
            "abbastanza", "piuttosto", "relativamente", "praticamente",
            "quasi", "circa", "pressappoco", "sostanzialmente",
            "fondamentalmente", "essenzialmente", "in sostanza", "in fondo",
            "tutto sommato", "grosso modo", "più o meno",
            "solo", "soltanto", "solamente", "appena", "persino", "perfino",
            "addirittura", "almeno", "quanto meno", "perlomeno",
            "già", "ancora", "mai", "sempre", "pure", "anche",
            "insomma", "dopotutto", "in effetti", "anzi", "ecco", "beh",
            "bene", "cioè",
        }
        self.first_pers_it = {"io", "me", "mi", "me stesso", "me stessa"}
        # Italian articles (definite + common indefinite forms)
        self.articles_it = {
            "il", "lo", "la", "i", "gli", "le", "un", "uno", "una", "un'",
        }
        self.definite_it = {"il", "lo", "la", "i", "gli", "le"}

    def analyze(self, doc, lemmas, tokens_text, token_text_SP, raw_text,
                sentences, paragraphs, lang="en"):
        """Compute the lexical metrics for one document.

        Args:
            doc: Iterable of token objects exposing ``.text`` and ``.pos_``
                (e.g. spaCy tokens -- assumption, any object with those two
                attributes works).  ``pos_`` is compared against ``"NOUN"``,
                ``"PROPN"`` and ``"PRON"``.
            lemmas: Sequence of lemmas; used for the r1/r2/r3 shares, the
                ``mrc`` lookup and the Guiraud index.
            tokens_text: Sequence of token strings; used for hapax counting,
                HD-D and as the denominator of several ratios.
            token_text_SP: Sequence whose length is the denominator of
                ``hapax_ratio``.  NOTE(review): how it differs from
                ``tokens_text`` is not visible here -- confirm with caller.
            raw_text: The document as a plain string.
            sentences: Sized collection of sentences (only its length and
                truthiness are used).
            paragraphs: Numeric paragraph count used as a divisor.
            lang: ``"it"`` selects the Italian word lists; any other value
                selects the English ones.

        Returns:
            dict mapping metric name to value.  Ratios whose denominator is
            empty/zero are reported as 0.  Note that the key ``"TTR"`` holds
            Guiraud's index (types / sqrt(tokens) over lemmas) and
            ``"gunning_fog"`` holds the first element of ``d_value``.
        """
        # Select the word lists for the requested language.
        if lang == "it":
            demonstratives = self.demons_set_it
            deictics = self.deictics_set_it
            emphatics = self.emphatics_it
            first_person = self.first_pers_it
            articles = self.articles_it
            definite_articles = self.definite_it
        else:
            demonstratives = self.demons_set_en
            deictics = self.deictics_set_en
            emphatics = self.emphatics_en
            first_person = self.first_pers_en
            articles = self.articles_en
            definite_articles = self.definite_en

        # Read the document once instead of re-iterating it per metric.
        words = []
        pos_tags = []
        for token in doc:
            words.append(token.text.lower())
            pos_tags.append(token.pos_)

        noun_count = sum(1 for pos in pos_tags if pos in ("NOUN", "PROPN"))
        pron_count = sum(1 for pos in pos_tags if pos == "PRON")
        article_count = _count_members(words, articles)
        demonstrative_count = _count_members(words, demonstratives)
        definite_count = _count_members(words, definite_articles)
        deictic_count = _count_members(words, deictics)
        first_person_count = _count_members(words, first_person)
        emphatic_count = _count_members(words, emphatics)

        # An empty sentence list is treated as one sentence for the
        # per-sentence ratios (but "reps_total" reports 0 in that case).
        num_sents = len(sentences) if sentences else 1

        # Hapax legomena: token strings occurring exactly once.
        frequencies = Counter(tokens_text)
        hapax_total = sum(1.0 for freq in frequencies.values() if freq == 1)
        if token_text_SP:
            hapax_ratio = round(hapax_total / len(token_text_SP), 4)
        else:
            hapax_ratio = 0

        # Immediate repetitions, computed on raw whitespace-separated words
        # with punctuation removed, so no language-specific resources are
        # needed.
        clean = re.sub(r'[^\w\s]', '', raw_text, flags=re.UNICODE).lower().split()
        repetitions = sum(
            _count_adjacent_repeats(clean, size) for size in (1, 2, 3)
        )

        guiraud = len(set(lemmas)) / math.sqrt(len(lemmas)) if lemmas else 0
        dvalue = d_value(raw_text, num_sents)[0]
        dvalue_sballato = d_value_sballato(raw_text, num_sents)[0]

        # Sum of all mrc scores divided by the number of lemmas whose score
        # is strictly positive.
        scores = [self.mrc.get(lemma, 0) for lemma in lemmas]
        scored_count = sum(1 for score in scores if score > 0)
        concreteness = sum(scores) / scored_count if scored_count else 0

        def band_share(band):
            """Lemmas found in *band*, relative to the number of tokens."""
            if not tokens_text:
                return 0
            hits = sum(1 for lemma in lemmas if lemma in band)
            return round(hits / len(tokens_text), 4)

        # Direct membership tests: the union of the three containers used to
        # be rebuilt for every single lemma.
        in_any_band = sum(
            1 for lemma in lemmas
            if lemma in self.r1 or lemma in self.r2 or lemma in self.r3
        )

        deictic_or_article = article_count + deictic_count

        return {
            "r1": band_share(self.r1),
            "r2": band_share(self.r2),
            "r3": band_share(self.r3),
            "other": max(0, len(lemmas) - in_any_band),
            "concreteness": concreteness,
            "hapax": hapax_total,
            "hapax_ratio": hapax_ratio,
            "pron_noun_ratio": round(pron_count / noun_count, 4) if noun_count > 0 else 0,
            "reps_total": round(repetitions / len(sentences), 4) if sentences else 0,
            "deictics": deictic_count,
            "articles": article_count,
            "pronouns": pron_count,
            "nouns": noun_count,
            "demonstratives": demonstrative_count,
            "definite_articles": definite_count,
            "demonstratives_ratio": round(demonstrative_count / num_sents, 4) if num_sents > 0 else 0,
            "definite_articles_ratio": round(definite_count / num_sents, 4) if num_sents > 0 else 0,
            "TTR": round(guiraud, 2),
            "gunning_fog": round(dvalue, 2),
            # Guarded like every other ratio: a zero paragraph count used to
            # raise ZeroDivisionError.
            "indice_d_testo": round(dvalue_sballato / paragraphs, 2) if paragraphs else 0,
            "HD-D": round(calculate_hdd(tokens_text), 2),
            "emphatic_particles": round(emphatic_count / len(tokens_text), 4) if tokens_text else 0,
            "first_person_ratio": round(first_person_count / pron_count, 4) if pron_count > 0 else 0,
            "deictic_Frequency": round(deictic_count / deictic_or_article, 4) if deictic_or_article > 0 else 0,
        }


# ---------------------------------------------------------------------------
# Module-level functions
# ---------------------------------------------------------------------------

# Everything except ASCII letters, digits and whitespace.
_NON_ASCII_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
# A maximal run of vowel letters; each run is counted as one syllable.
_VOWEL_RUN_RE = re.compile(r'[aeiouy]+')


def _count_members(words, lexicon):
    """Return how many items of *words* are contained in *lexicon*."""
    return sum(1 for word in words if word in lexicon)


def _count_adjacent_repeats(words, size):
    """Count positions where a run of *size* words is immediately repeated."""
    return sum(
        1
        for start in range(len(words) - 2 * size + 1)
        if words[start:start + size] == words[start + size:start + 2 * size]
    )


def _word_stats(text):
    """Return ``(word_count, polysyllable_percentage)`` for *text*.

    A word is "polysyllabic" when it contains at least three vowel runs.

    NOTE(review): characters outside ``a-z``, ``A-Z``, ``0-9`` and whitespace
    are deleted before splitting, so accented letters are dropped from words
    and a word made only of such characters (e.g. Italian "è") vanishes from
    the count.  Left unchanged so existing metric values stay comparable.
    """
    words = _NON_ASCII_ALNUM_RE.sub('', text).lower().split()
    num_words = len(words)
    polysyllables = sum(
        1 for word in words if len(_VOWEL_RUN_RE.findall(word)) >= 3
    )
    percentage = (polysyllables / num_words * 100) if num_words > 0 else 0
    return num_words, percentage


def d_value(text, num_sents):
    """Return ``(score, asl, pw)`` with ``score = 0.4 * (asl + pw)``.

    ``asl`` is the number of words divided by *num_sents* (0 when *num_sents*
    is not positive) and ``pw`` is the percentage of polysyllabic words.
    """
    num_words, pw = _word_stats(text)
    asl = num_words / num_sents if num_sents > 0 else 0
    return 0.4 * (asl + pw), asl, pw


def d_value_sballato(text, num_sents):
    """Return ``(score, num_words, pw)`` with ``score = 0.4 * (num_words + pw)``.

    Same as ``d_value`` except that the total word count is used in place of
    the average sentence length.  *num_sents* is not used; it is kept so the
    signature matches ``d_value``.
    """
    num_words, pw = _word_stats(text)
    return 0.4 * (num_words + pw), num_words, pw


def calculate_hdd(tokens, n=42):
    """Return the expected number of distinct types in a sample of *n* tokens.

    For every type the probability of appearing at least once in *n* tokens
    drawn without replacement is computed from binomial coefficients, and the
    probabilities are summed; the sum is returned without further
    normalisation.  When there are fewer than *n* tokens the number of
    distinct types is returned as a float instead.
    """
    total = len(tokens)
    if total < n:
        return float(len(set(tokens)))

    frequencies = Counter(tokens)
    all_samples = math.comb(total, n)
    hdd_sum = 0.0
    for freq in frequencies.values():
        remaining = total - freq
        if remaining >= n:
            # Share of samples that contain no occurrence of this type.
            prob_absent = math.comb(remaining, n) / all_samples
        else:
            # Too few other tokens: every sample must include this type.
            prob_absent = 0.0
        hdd_sum += 1.0 - prob_absent
    return hdd_sum