| import re |
| import math |
| from collections import Counter |
|
|
|
|
class LexicalModule:
    """Compute lexical features of a text using English or Italian lexicons.

    The constructor stores three word collections (``r1``, ``r2``, ``r3``)
    and a lemma -> score mapping (``mrc``), and builds the per-language
    lexicons (demonstratives, deictics, emphatics, first-person forms,
    articles, definite articles) as instance attributes.
    """

    def __init__(self, r1, r2, r3, mrc):
        """Store the external resources and build the built-in lexicons.

        Args:
            r1, r2, r3: collections of lemmas; ``analyze`` tests lemma
                membership in each of them.
                NOTE(review): presumably frequency/register bands -- confirm
                with the caller.
            mrc: mapping ``lemma -> numeric score`` read with
                ``mrc.get(lemma, 0)``; only scores ``> 0`` count as rated.
                NOTE(review): presumably concreteness ratings -- confirm.
        """
        self.r1, self.r2, self.r3, self.mrc = r1, r2, r3, mrc

        # NOTE(review): lexicon lookups in ``analyze`` are done one token at
        # a time, so multi-word entries below (e.g. "last week", "kind of",
        # "senza dubbio") can only match if the tokenizer emits them as a
        # single token -- verify against the tokenizer in use.

        # ------------------------------------------------------------------
        # English lexicons
        # ------------------------------------------------------------------
        self.demons_set_en = {"this", "that", "these", "those"}
        self.deictics_set_en = {
            "i", "me", "my", "mine", "we", "us", "our", "ours",
            "you", "your", "yours", "he", "him", "his", "she",
            "her", "hers", "it", "its", "they", "them", "their", "theirs",
            "here", "there", "this", "these", "that", "those",
            "yonder", "above", "below", "in front", "behind",
            "now", "then", "today", "yesterday", "tomorrow", "tonight",
            "last week", "last month", "last year", "next week",
            "next month", "next year", "ago", "soon", "later", "currently",
            "sir", "madam", "ma'am", "your honor", "professor", "doctor",
            "the former", "the latter", "above-mentioned", "as follows", "hereby",
            "this way", "that time", "these days", "those years",
            "thus", "therefore", "henceforth",
        }
        self.emphatics_en = {
            "just", "even", "so", "really", "absolutely",
            "quite", "indeed", "definitely", "certainly",
            "rather", "pretty", "precisely", "actually",
            "totally", "completely", "utterly", "extremely",
            "basically", "simply", "literally", "honestly",
            "frankly", "truly", "surely", "clearly", "obviously",
            "perfectly", "exactly", "merely", "only", "still",
            "yet", "already", "else", "ever", "never", "always",
            "very", "much", "too", "enough", "well", "oh",
            "wow", "hey", "like", "such", "right", "now",
            "then", "further", "more", "less", "super",
            "incredibly", "remarkably", "genuinely",
            "positively", "unbelievably",
            "extraordinarily", "immensely", "vastly",
            "somewhat", "kind of", "sort of",
            "pretty much", "practically", "essentially",
            "virtually", "nearly", "almost", "about",
            "roughly", "approximately", "potentially",
            "arguably", "seemingly", "apparently",
            "theoretically", "conceptually", "fundamentally",
            "relatively", "comparatively", "distinctly",
            "considerably", "significantly", "substantially",
            "surprisingly", "unexpectedly", "oddly",
            "strangely", "curiously", "ironically",
            "paradoxically", "undoubtedly", "indisputably", "categorically",
        }
        self.first_pers_en = {"i", "me", "myself"}
        self.articles_en = {"a", "an", "the", "some"}
        self.definite_en = {"the"}

        # ------------------------------------------------------------------
        # Italian lexicons
        # ------------------------------------------------------------------
        self.demons_set_it = {
            "questo", "questa", "questi", "queste",
            "quello", "quella", "quelli", "quelle",
            "codesto", "codesta", "codesti", "codeste",
            "ciò", "tal", "tale", "tali",
        }
        self.deictics_set_it = {
            # pronouns and possessives
            "io", "me", "mi", "mio", "mia", "miei", "mie",
            "noi", "ci", "nostro", "nostra", "nostri", "nostre",
            "tu", "te", "ti", "tuo", "tua", "tuoi", "tue",
            "voi", "vi", "vostro", "vostra", "vostri", "vostre",
            "lui", "lo", "gli", "suo", "sua", "suoi", "sue",
            "lei", "la", "loro", "li", "le",
            "essi", "esse", "sé", "si",
            # place
            "qui", "qua", "là", "lì", "lassù", "laggiù",
            "davanti", "dietro", "sopra", "sotto", "accanto",
            "vicino", "lontano", "oltre", "presso",
            # time
            "ora", "adesso", "oggi", "ieri", "domani", "stanotte",
            "settimana scorsa", "mese scorso", "anno scorso",
            "prossima settimana", "prossimo mese", "prossimo anno",
            "presto", "tardi", "attualmente", "nel frattempo",
            # forms of address
            "signore", "signora", "professore", "dottore", "eccellenza",
            # references to the text itself
            "il suddetto", "il sottoscritto", "quanto sopra", "come segue",
            # connectives
            "così", "pertanto", "quindi", "allora", "dunque",
        }
        self.emphatics_it = {
            "molto", "tanto", "assai", "parecchio", "alquanto",
            "enormemente", "immensamente", "incredibilmente", "straordinariamente",
            "oltremodo", "sommamente", "estremamente", "tremendamente",
            "proprio", "davvero", "veramente", "realmente", "effettivamente",
            "concretamente", "precisamente", "esattamente", "letteralmente",
            "certamente", "sicuramente", "assolutamente", "decisamente",
            "indubbiamente", "senz'altro", "senza dubbio", "ovviamente",
            "naturalmente", "chiaramente", "evidentemente",
            "abbastanza", "piuttosto", "relativamente",
            "praticamente", "quasi", "circa", "pressappoco", "sostanzialmente",
            "fondamentalmente", "essenzialmente", "in sostanza", "in fondo",
            "tutto sommato", "grosso modo", "più o meno",
            "solo", "soltanto", "solamente", "appena", "persino", "perfino",
            "addirittura", "almeno", "quanto meno", "perlomeno",
            "già", "ancora", "mai", "sempre", "pure", "anche",
            "insomma", "dopotutto", "in effetti", "anzi", "ecco",
            "beh", "bene", "cioè",
        }
        self.first_pers_it = {"io", "me", "mi", "me stesso", "me stessa"}
        self.articles_it = {"il", "lo", "la", "i", "gli", "le", "un", "uno", "una", "un'"}
        self.definite_it = {"il", "lo", "la", "i", "gli", "le"}

    def _active_lexicons(self, lang):
        """Return the lexicon sets to use for ``lang``.

        The tuple order is (demonstratives, deictics, emphatics,
        first-person forms, articles, definite articles). Only ``"it"``
        selects Italian; any other value falls back to English.
        """
        if lang == "it":
            return (
                self.demons_set_it,
                self.deictics_set_it,
                self.emphatics_it,
                self.first_pers_it,
                self.articles_it,
                self.definite_it,
            )
        return (
            self.demons_set_en,
            self.deictics_set_en,
            self.emphatics_en,
            self.first_pers_en,
            self.articles_en,
            self.definite_en,
        )

    @staticmethod
    def _count_adjacent_repeats(words, size):
        """Count positions where a run of ``size`` words is immediately repeated."""
        return sum(
            1
            for i in range(len(words) - 2 * size + 1)
            if words[i:i + size] == words[i + size:i + 2 * size]
        )

    @staticmethod
    def _ratio(numerator, denominator, digits):
        """Return ``numerator / denominator`` rounded, or 0 for a falsy denominator."""
        return round(numerator / denominator, digits) if denominator else 0

    def analyze(self, doc, lemmas, tokens_text, token_text_SP, raw_text, sentences, paragraphs, lang="en"):
        """Compute the lexical feature dictionary for one text.

        Args:
            doc: iterable of tokens exposing ``.text`` and ``.pos_``
                (NOTE(review): looks like a spaCy ``Doc`` -- confirm).
            lemmas: sequence of lemmas of the text.
            tokens_text: sequence of token strings; used for the hapax
                count, the HD-D score and as denominator of several ratios.
            token_text_SP: sequence whose length is the denominator of
                ``hapax_ratio`` (NOTE(review): presumably the tokens after
                some filtering -- confirm with the caller).
            raw_text: the raw text; used for repetition counting and for
                the ``d_value`` / ``d_value_sballato`` scores.
            sentences: sequence of sentences; only its length is used.
            paragraphs: number of paragraphs, or a sized collection of
                them. A zero/empty/``None`` value yields
                ``"indice_d_testo" == 0`` instead of raising.
            lang: ``"it"`` for the Italian lexicons, anything else for
                the English ones.

        Returns:
            dict mapping feature names to numbers. Note that the ``"TTR"``
            key holds ``len(set(lemmas)) / sqrt(len(lemmas))`` and the
            ``"gunning_fog"`` key holds the first element returned by
            ``d_value``.
        """
        demons, deictics, emphatics, first_pers, articles, definite = self._active_lexicons(lang)

        # Single pass over the document for every token-level counter.
        noun_count = pron_count = 0
        article_count = demons_count = definite_count = 0
        deictic_count = first_p_count = emph_count = 0
        for tok in doc:
            if tok.pos_ in ("NOUN", "PROPN"):
                noun_count += 1
            elif tok.pos_ == "PRON":
                pron_count += 1

            word = tok.text.lower()
            if word in articles:
                article_count += 1
            if word in demons:
                demons_count += 1
            if word in definite:
                definite_count += 1
            if word in deictics:
                deictic_count += 1
            if word in first_pers:
                first_p_count += 1
            if word in emphatics:
                emph_count += 1

        # With no sentences the per-sentence ratios divide by 1.
        num_sents = len(sentences) if sentences else 1
        n_tokens = len(tokens_text) if tokens_text else 0

        # Hapax legomena: token strings occurring exactly once.
        frequencies = Counter(tokens_text)
        hapax_total = sum(1.0 for freq in frequencies.values() if freq == 1)
        hapax_ratio = self._ratio(hapax_total, len(token_text_SP) if token_text_SP else 0, 4)

        # Immediate repetitions of 1-, 2- and 3-word runs in the
        # punctuation-free, lower-cased raw text.
        clean = re.sub(r'[^\w\s]', '', raw_text, flags=re.UNICODE).lower().split()
        reps = sum(self._count_adjacent_repeats(clean, size) for size in (1, 2, 3))

        guiraud = len(set(lemmas)) / math.sqrt(len(lemmas)) if lemmas else 0
        dvalue = d_value(raw_text, num_sents)[0]
        dvalue_sballato = d_value_sballato(raw_text, num_sents)[0]

        # ``paragraphs`` may be a count or a sized collection; a falsy
        # count must not trigger a ZeroDivisionError.
        try:
            paragraph_count = len(paragraphs)
        except TypeError:
            paragraph_count = paragraphs

        # Mean over positively rated lemmas only, so numerator and
        # denominator cover the same items.
        rated = [score for score in (self.mrc.get(lemma, 0) for lemma in lemmas) if score > 0]

        # Build the union once instead of once per lemma.
        banded = frozenset().union(self.r1, self.r2, self.r3)

        return {
            "r1": self._ratio(sum(1 for l in lemmas if l in self.r1), n_tokens, 4),
            "r2": self._ratio(sum(1 for l in lemmas if l in self.r2), n_tokens, 4),
            "r3": self._ratio(sum(1 for l in lemmas if l in self.r3), n_tokens, 4),
            "other": max(0, len(lemmas) - sum(1 for l in lemmas if l in banded)),
            "concreteness": sum(rated) / len(rated) if rated else 0,
            "hapax": hapax_total,
            "hapax_ratio": hapax_ratio,
            "pron_noun_ratio": self._ratio(pron_count, noun_count, 4),
            "reps_total": round(reps / len(sentences), 4) if sentences else 0,
            "deictics": deictic_count,
            "articles": article_count,
            "pronouns": pron_count,
            "nouns": noun_count,
            "demonstratives": demons_count,
            "definite_articles": definite_count,
            "demonstratives_ratio": round(demons_count / num_sents, 4),
            "definite_articles_ratio": round(definite_count / num_sents, 4),
            "TTR": round(guiraud, 2),
            "gunning_fog": round(dvalue, 2),
            "indice_d_testo": round(dvalue_sballato / paragraph_count, 2) if paragraph_count else 0,
            "HD-D": round(calculate_hdd(tokens_text), 2),
            "emphatic_particles": self._ratio(emph_count, n_tokens, 4),
            "first_person_ratio": self._ratio(first_p_count, pron_count, 4),
            "deictic_Frequency": self._ratio(deictic_count, article_count + deictic_count, 4),
        }
|
|
|
|
| |
| |
| |
|
|
def d_value(text, num_sents):
    """Return a Gunning-fog-style score for ``text``.

    The text is lower-cased and stripped of punctuation, then split on
    whitespace. A word counts as polysyllabic when it contains at least
    three separate groups of consecutive vowels.

    Accented letters are preserved and accented vowels are counted, so
    non-ASCII words such as Italian "qualità" or "è" are neither truncated
    nor dropped. Results for pure-ASCII text are unaffected by this.

    Args:
        text: raw text to score.
        num_sents: number of sentences in ``text``; when it is not
            positive the average sentence length is taken as 0.

    Returns:
        Tuple ``(score, asl, pw)`` where ``asl`` is the average number of
        words per sentence, ``pw`` the percentage of polysyllabic words and
        ``score = 0.4 * (asl + pw)``.
    """
    # Drop everything that is neither a letter/digit (in any script) nor
    # whitespace; "_" is removed explicitly because \w would keep it.
    words = re.sub(r'[^\w\s]|_', '', text).lower().split()
    num_words = len(words)
    asl = num_words / num_sents if num_sents > 0 else 0

    # One match per run of consecutive vowels, accented forms included.
    vowel_groups = re.compile(r'[aeiouyàáâãäåèéêëìíîïòóôõöùúûüýÿ]+')
    polysyllable_words = sum(
        1 for word in words if len(vowel_groups.findall(word)) >= 3
    )

    pw = (polysyllable_words / num_words * 100) if num_words > 0 else 0
    return 0.4 * (asl + pw), asl, pw
|
|
|
|
def d_value_sballato(text, num_sents):
    """Return a fog-style score that uses the raw word count as length term.

    The text is lower-cased and stripped of punctuation, then split on
    whitespace. A word counts as polysyllabic when it contains at least
    three separate groups of consecutive vowels.

    Accented letters are preserved and accented vowels are counted, so
    non-ASCII words such as Italian "qualità" or "è" are neither truncated
    nor dropped. Results for pure-ASCII text are unaffected by this.

    Args:
        text: raw text to score.
        num_sents: accepted for signature compatibility; not used.

    Returns:
        Tuple ``(score, num_words, pw)`` where ``pw`` is the percentage of
        polysyllabic words and ``score = 0.4 * (num_words + pw)``.
    """
    # Drop everything that is neither a letter/digit (in any script) nor
    # whitespace; "_" is removed explicitly because \w would keep it.
    words = re.sub(r'[^\w\s]|_', '', text).lower().split()
    num_words = len(words)

    # One match per run of consecutive vowels, accented forms included.
    vowel_groups = re.compile(r'[aeiouyàáâãäåèéêëìíîïòóôõöùúûüýÿ]+')
    polysyllable_words = sum(
        1 for word in words if len(vowel_groups.findall(word)) >= 3
    )

    pw = (polysyllable_words / num_words * 100) if num_words > 0 else 0
    return 0.4 * (num_words + pw), num_words, pw
|
|
|
|
def calculate_hdd(tokens, n=42):
    """Return the summed probability that each type appears in a random sample.

    For every distinct token, the probability of it occurring at least once
    in a sample of ``n`` tokens drawn without replacement is added to the
    total. When there are fewer than ``n`` tokens the number of distinct
    tokens is returned instead (as a float).

    Args:
        tokens: sequence of hashable tokens.
        n: sample size.

    Returns:
        The accumulated score as a float.
    """
    total = len(tokens)
    if total < n:
        return float(len(set(tokens)))

    all_samples = math.comb(total, n)
    score = 0.0
    for freq in Counter(tokens).values():
        remaining = total - freq
        # Too few other tokens to fill a sample: the type cannot be absent.
        absent = math.comb(remaining, n) / all_samples if remaining >= n else 0.0
        score += 1.0 - absent
    return score
|
|