Spaces:

amaisto
/

CO3

Running

App Files Files Community

CO3 / src /modules /lexical.py

amaisto

Upload 4 files

4fde834 verified 26 days ago

raw

history blame contribute delete

11.8 kB

	import re
	import math
	from collections import Counter


	class LexicalModule:
	    """Compute lexical statistics for an English or Italian text.

	    The instance holds three lemma collections (``r1``, ``r2``, ``r3``), a
	    lemma -> score mapping (``mrc``) and, for each supported language, the
	    closed word lists used to count demonstratives, deictics, emphatic
	    particles, first-person forms and articles.
	    """

	    def __init__(self, r1, r2, r3, mrc):
	        """Store the lexical resources and build the per-language word sets.

	        Args:
	            r1, r2, r3: Collections of lemmas. ``analyze`` only tests
	                membership in them and takes their union.
	                NOTE(review): presumably frequency bands -- confirm.
	            mrc: Mapping read with ``mrc.get(lemma, 0)``; only scores
	                greater than 0 contribute to the "concreteness" feature.
	        """
	        self.r1, self.r2, self.r3, self.mrc = r1, r2, r3, mrc

	        # NOTE(review): every set below is matched against the lower-cased
	        # text of ONE token at a time. If `doc` yields single-word tokens
	        # (presumably a spaCy Doc -- confirm), multi-word entries such as
	        # "last week" or "senza dubbio" can never match.

	        # --- English sets ---
	        self.demons_set_en = {"this", "that", "these", "those"}
	        self.deictics_set_en = {
	            "i", "me", "my", "mine", "we", "us", "our", "ours",
	            "you", "your", "yours", "he", "him", "his", "she",
	            "her", "hers", "it", "its", "they", "them", "their", "theirs",
	            "here", "there", "this", "these", "that", "those",
	            "yonder", "above", "below", "in front", "behind",
	            "now", "then", "today", "yesterday", "tomorrow", "tonight",
	            "last week", "last month", "last year", "next week",
	            "next month", "next year", "ago", "soon", "later", "currently",
	            "sir", "madam", "ma'am", "your honor", "professor", "doctor",
	            "the former", "the latter", "above-mentioned", "as follows", "hereby",
	            "this way", "that time", "these days", "those years",
	            "thus", "therefore", "henceforth",
	        }
	        self.emphatics_en = {
	            "just", "even", "so", "really", "absolutely",
	            "quite", "indeed", "definitely", "certainly",
	            "rather", "pretty", "precisely", "actually",
	            "totally", "completely", "utterly", "extremely",
	            "basically", "simply", "literally", "honestly",
	            "frankly", "truly", "surely", "clearly", "obviously",
	            "perfectly", "exactly", "merely", "only", "still",
	            "yet", "already", "else", "ever", "never", "always",
	            "very", "much", "too", "enough", "well", "oh",
	            "wow", "hey", "like", "such", "right", "now",
	            "then", "further", "more", "less", "super",
	            "incredibly", "remarkably", "genuinely",
	            "positively", "unbelievably",
	            "extraordinarily", "immensely", "vastly",
	            "somewhat", "kind of", "sort of",
	            "pretty much", "practically", "essentially",
	            "virtually", "nearly", "almost", "about",
	            "roughly", "approximately", "potentially",
	            "arguably", "seemingly", "apparently",
	            "theoretically", "conceptually", "fundamentally",
	            "relatively", "comparatively", "distinctly",
	            "considerably", "significantly", "substantially",
	            "surprisingly", "unexpectedly", "oddly",
	            "strangely", "curiously", "ironically",
	            "paradoxically", "undoubtedly", "indisputably", "categorically",
	        }
	        self.first_pers_en = {"i", "me", "myself"}
	        self.articles_en = {"a", "an", "the", "some"}
	        self.definite_en = {"the"}

	        # --- Italian sets ---
	        self.demons_set_it = {
	            "questo", "questa", "questi", "queste",
	            "quello", "quella", "quelli", "quelle",
	            "codesto", "codesta", "codesti", "codeste",
	            "ciò", "tal", "tale", "tali",
	        }
	        self.deictics_set_it = {
	            # Personal
	            "io", "me", "mi", "mio", "mia", "miei", "mie",
	            "noi", "ci", "nostro", "nostra", "nostri", "nostre",
	            "tu", "te", "ti", "tuo", "tua", "tuoi", "tue",
	            "voi", "vi", "vostro", "vostra", "vostri", "vostre",
	            "lui", "lo", "gli", "suo", "sua", "suoi", "sue",
	            "lei", "la", "loro", "li", "le",
	            "essi", "esse", "sé", "si",
	            # Spatial
	            "qui", "qua", "là", "lì", "lassù", "laggiù",
	            "davanti", "dietro", "sopra", "sotto", "accanto",
	            "vicino", "lontano", "oltre", "presso",
	            # Temporal
	            "ora", "adesso", "oggi", "ieri", "domani", "stanotte",
	            "settimana scorsa", "mese scorso", "anno scorso",
	            "prossima settimana", "prossimo mese", "prossimo anno",
	            "presto", "tardi", "attualmente", "nel frattempo",
	            # Social
	            "signore", "signora", "professore", "dottore", "eccellenza",
	            # Discourse
	            "il suddetto", "il sottoscritto", "quanto sopra", "come segue",
	            # Adverbial demonstratives
	            "così", "pertanto", "quindi", "allora", "dunque",
	        }
	        self.emphatics_it = {
	            "molto", "tanto", "assai", "parecchio", "alquanto",
	            "enormemente", "immensamente", "incredibilmente", "straordinariamente",
	            "oltremodo", "sommamente", "estremamente", "tremendamente",
	            "proprio", "davvero", "veramente", "realmente", "effettivamente",
	            "concretamente", "precisamente", "esattamente", "letteralmente",
	            "certamente", "sicuramente", "assolutamente", "decisamente",
	            "indubbiamente", "senz'altro", "senza dubbio", "ovviamente",
	            "naturalmente", "chiaramente", "evidentemente",
	            "abbastanza", "piuttosto", "relativamente",
	            "praticamente", "quasi", "circa", "pressappoco", "sostanzialmente",
	            "fondamentalmente", "essenzialmente", "in sostanza", "in fondo",
	            "tutto sommato", "grosso modo", "più o meno",
	            "solo", "soltanto", "solamente", "appena", "persino", "perfino",
	            "addirittura", "almeno", "quanto meno", "perlomeno",
	            "già", "ancora", "mai", "sempre", "pure", "anche",
	            "insomma", "dopotutto", "in effetti", "anzi", "ecco",
	            "beh", "bene", "cioè",
	        }
	        self.first_pers_it = {"io", "me", "mi", "me stesso", "me stessa"}
	        # Italian articles (definite + common indefinite)
	        self.articles_it = {"il", "lo", "la", "i", "gli", "le", "un", "uno", "una", "un'"}
	        self.definite_it = {"il", "lo", "la", "i", "gli", "le"}

	    @staticmethod
	    def _count_members(form_counts, vocabulary):
	        """Return how many counted tokens have a surface form in ``vocabulary``."""
	        return sum(freq for form, freq in form_counts.items() if form in vocabulary)

	    @staticmethod
	    def _adjacent_repeats(words, size):
	        """Count positions where ``size`` words are immediately followed by the same ``size`` words."""
	        span = 2 * size
	        return sum(
	            1
	            for start in range(len(words) - span + 1)
	            if words[start:start + size] == words[start + size:start + span]
	        )

	    @staticmethod
	    def _ratio(numerator, denominator, digits=4):
	        """Return ``numerator / denominator`` rounded to ``digits``, or 0 if the denominator is 0."""
	        return round(numerator / denominator, digits) if denominator else 0

	    def analyze(self, doc, lemmas, tokens_text, token_text_SP, raw_text,
	                sentences, paragraphs, lang="en"):
	        """Return a dict of lexical features for one text.

	        Args:
	            doc: Iterable of tokens exposing ``.text`` and ``.pos_``
	                (presumably a spaCy ``Doc`` -- confirm with the caller).
	            lemmas: Sequence of lemmas; used for r1/r2/r3, "other",
	                "concreteness" and the lemma-diversity index.
	            tokens_text: Sequence of token strings; used for hapax counting,
	                HD-D and as the denominator of r1/r2/r3 and
	                "emphatic_particles".
	            token_text_SP: Sequence whose length is the denominator of
	                "hapax_ratio".
	            raw_text: The unprocessed text; used for repetition counting and
	                for the two ``d_value`` scores.
	            sentences: Sequence of sentences; only its length is used.
	            paragraphs: Number that divides the ``d_value_sballato`` score.
	            lang: ``"it"`` selects the Italian word sets; any other value
	                selects the English ones.

	        Returns:
	            dict: Feature name -> value. Ratios whose denominator is zero
	            are reported as 0. Note that the key ``"TTR"`` holds
	            ``unique lemmas / sqrt(lemma count)`` (Guiraud's index) and
	            ``"gunning_fog"`` holds the score returned by ``d_value``.
	        """
	        # Select the word sets for the requested language.
	        if lang == "it":
	            demonstratives = self.demons_set_it
	            deictics = self.deictics_set_it
	            emphatics = self.emphatics_it
	            first_person = self.first_pers_it
	            articles = self.articles_it
	            definite = self.definite_it
	        else:
	            demonstratives = self.demons_set_en
	            deictics = self.deictics_set_en
	            emphatics = self.emphatics_en
	            first_person = self.first_pers_en
	            articles = self.articles_en
	            definite = self.definite_en

	        # Single pass over the document: every token-level count below is
	        # derived from these three accumulators.
	        form_counts = Counter()
	        noun_count = 0
	        pron_count = 0
	        for token in doc:
	            form_counts[token.text.lower()] += 1
	            if token.pos_ in ("NOUN", "PROPN"):
	                noun_count += 1
	            elif token.pos_ == "PRON":
	                pron_count += 1

	        article_count = self._count_members(form_counts, articles)
	        demonstrative_count = self._count_members(form_counts, demonstratives)
	        definite_count = self._count_members(form_counts, definite)
	        deictic_count = self._count_members(form_counts, deictics)
	        first_person_count = self._count_members(form_counts, first_person)
	        emphatic_count = self._count_members(form_counts, emphatics)

	        n_tokens = len(tokens_text) if tokens_text else 0
	        n_tokens_sp = len(token_text_SP) if token_text_SP else 0
	        n_sentences = len(sentences) if sentences else 0
	        # Per-sentence ratios and the d_value scores treat "no sentences" as one.
	        num_sents = n_sentences or 1

	        # Word forms occurring exactly once (kept as a float, as before).
	        hapax_total = sum(1.0 for freq in Counter(tokens_text).values() if freq == 1)

	        # Repetitions (language-agnostic: works on the raw, punctuation-free words).
	        words = re.sub(r'[^\w\s]', '', raw_text, flags=re.UNICODE).lower().split()
	        repetitions = sum(self._adjacent_repeats(words, size) for size in (1, 2, 3))

	        guiraud = len(set(lemmas)) / math.sqrt(len(lemmas)) if lemmas else 0
	        dvalue = d_value(raw_text, num_sents)[0]
	        dvalue_sballato = d_value_sballato(raw_text, num_sents)[0]

	        # Average only over lemmas that actually have a positive score, so
	        # numerator and denominator cover the same lemmas.
	        rated_scores = [
	            score
	            for score in (self.mrc.get(lemma, 0) for lemma in lemmas)
	            if score > 0
	        ]
	        concreteness = sum(rated_scores) / len(rated_scores) if rated_scores else 0

	        # Build the union once instead of once per lemma.
	        ranked = set().union(self.r1, self.r2, self.r3)

	        return {
	            # NOTE(review): numerators count lemmas, denominators count tokens_text.
	            "r1": self._ratio(sum(1 for lemma in lemmas if lemma in self.r1), n_tokens),
	            "r2": self._ratio(sum(1 for lemma in lemmas if lemma in self.r2), n_tokens),
	            "r3": self._ratio(sum(1 for lemma in lemmas if lemma in self.r3), n_tokens),
	            "other": sum(1 for lemma in lemmas if lemma not in ranked),
	            "concreteness": concreteness,
	            "hapax": hapax_total,
	            "hapax_ratio": self._ratio(hapax_total, n_tokens_sp),
	            "pron_noun_ratio": self._ratio(pron_count, noun_count),
	            "reps_total": self._ratio(repetitions, n_sentences),
	            "deictics": deictic_count,
	            "articles": article_count,
	            "pronouns": pron_count,
	            "nouns": noun_count,
	            "demonstratives": demonstrative_count,
	            "definite_articles": definite_count,
	            "demonstratives_ratio": self._ratio(demonstrative_count, num_sents),
	            "definite_articles_ratio": self._ratio(definite_count, num_sents),
	            "TTR": round(guiraud, 2),
	            "gunning_fog": round(dvalue, 2),
	            # Guarded: a paragraph count of 0 used to raise ZeroDivisionError.
	            "indice_d_testo": self._ratio(dvalue_sballato, paragraphs, 2),
	            "HD-D": round(calculate_hdd(tokens_text), 2),
	            "emphatic_particles": self._ratio(emphatic_count, n_tokens),
	            # Denominator is the count of ALL tokens tagged PRON.
	            "first_person_ratio": self._ratio(first_person_count, pron_count),
	            "deictic_Frequency": self._ratio(deictic_count, article_count + deictic_count),
	        }


	# ---------------------------------------------------------------------------
	# Funzioni module-level (invariate rispetto all'originale)
	# ---------------------------------------------------------------------------

	def d_value(text, num_sents):
	    """Compute a Gunning-fog-style score: ``0.4 * (ASL + PW)``.

	    Args:
	        text: Raw text to score.
	        num_sents: Number of sentences in ``text``.

	    Returns:
	        tuple: ``(score, asl, pw)`` where ``asl`` is words per sentence
	        (0 when ``num_sents`` is not positive), ``pw`` is the percentage of
	        words containing at least three vowel groups (0 for empty text) and
	        ``score`` is ``0.4 * (asl + pw)``.
	    """
	    # Drop punctuation and underscores but keep Unicode letters. The previous
	    # ASCII-only class deleted accented characters, so Italian words such as
	    # "è" disappeared and "perché" was truncated to "perch".
	    words = re.sub(r"[^\w\s]|_", "", text).lower().split()
	    num_words = len(words)
	    asl = num_words / num_sents if num_sents > 0 else 0

	    # A maximal run of vowels approximates one syllable; accented vowels are
	    # included so Italian syllables are not under-counted.
	    vowel_run = re.compile(r"[aeiouyàáâãäåèéêëìíîïòóôõöùúûüýÿ]+")
	    polysyllable_words = sum(
	        1 for word in words if len(vowel_run.findall(word)) >= 3
	    )

	    pw = (polysyllable_words / num_words * 100) if num_words > 0 else 0
	    return 0.4 * (asl + pw), asl, pw


	def d_value_sballato(text, num_sents):
	    """Variant of ``d_value`` that uses the total word count instead of words per sentence.

	    Args:
	        text: Raw text to score.
	        num_sents: Unused; kept so the signature matches ``d_value``.

	    Returns:
	        tuple: ``(score, num_words, pw)`` where ``pw`` is the percentage of
	        words containing at least three vowel groups (0 for empty text) and
	        ``score`` is ``0.4 * (num_words + pw)``.
	    """
	    # Drop punctuation and underscores but keep Unicode letters. The previous
	    # ASCII-only class deleted accented characters, so Italian words such as
	    # "è" disappeared and "perché" was truncated to "perch".
	    words = re.sub(r"[^\w\s]|_", "", text).lower().split()
	    num_words = len(words)

	    # A maximal run of vowels approximates one syllable; accented vowels are
	    # included so Italian syllables are not under-counted.
	    vowel_run = re.compile(r"[aeiouyàáâãäåèéêëìíîïòóôõöùúûüýÿ]+")
	    polysyllable_words = sum(
	        1 for word in words if len(vowel_run.findall(word)) >= 3
	    )

	    pw = (polysyllable_words / num_words * 100) if num_words > 0 else 0
	    return 0.4 * (num_words + pw), num_words, pw


	def calculate_hdd(tokens, n=42):
	    """Return the expected number of distinct types in a random sample of ``n`` tokens.

	    For every distinct token the hypergeometric probability of it being
	    absent from a sample of ``n`` tokens drawn without replacement is
	    computed, and ``1 - P(absent)`` is summed over all types. The sum is
	    returned as is (it is not divided by ``n``).

	    Args:
	        tokens: Sequence of tokens.
	        n: Sample size.

	    Returns:
	        float: The summed presence probabilities, or simply the number of
	        distinct tokens when the text has fewer than ``n`` tokens.
	    """
	    total = len(tokens)
	    if total < n:
	        # Text shorter than the sample: fall back to the raw type count.
	        return float(len(set(tokens)))

	    all_samples = math.comb(total, n)
	    expected_types = 0.0
	    for freq in Counter(tokens).values():
	        others = total - freq
	        # Samples that avoid this type must be drawn from the other tokens;
	        # if there are fewer than n of those, the type is always present.
	        absent = math.comb(others, n) / all_samples if others >= n else 0.0
	        expected_types += 1.0 - absent

	    return expected_types