| import math |
| import re |
|
|
| import numpy as np |
| from sklearn.metrics.pairwise import cosine_similarity |
| from sentence_transformers import SentenceTransformer |
|
|
|
|
class CohesionModule:
    """Compute lexical, semantic and connective cohesion metrics for a text.

    The module combines three signals:

    * lexical overlap (Jaccard index) between the content-word lemmas of
      adjacent sentences inside each paragraph;
    * cosine similarity between embeddings of adjacent sentences and of
      adjacent paragraphs, produced by ``self.model``;
    * frequency of discourse connectives, grouped by category, looked up in
      the English or Italian lexicon built in ``__init__``.
    """

    # Part-of-speech tags whose lemmas take part in the lexical overlap.
    _CONTENT_POS = frozenset({"NOUN", "VERB", "ADJ", "PROPN"})

    def __init__(self):
        """Load the sentence-embedding model and build the connective lexicons."""
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

        # English connectives, keyed by category.
        self.conn_map_en = {
            "AdPos": ["also", "moreover", "furthermore", "in addition", "plus",
                      "equally", "likewise", "similarly", "as well", "alike",
                      "additionally", "what's more", "not only", "coupled with",
                      "along with", "including", "together with", "in conjunction with",
                      "along the same lines", "correspondingly"],
            "AdNeg": ["instead", "conversely", "rather", "on the contrary",
                      "alternatively", "nonetheless", "however", "nevertheless",
                      "by contrast", "on the other hand", "in contrast", "unlike",
                      "despite this", "notwithstanding", "even so", "still",
                      "be that as it may", "in spite of", "rather", "whereas"],
            "CausPos": ["therefore", "hence", "thus", "consequently", "accordingly",
                        "so", "as a result", "because of this", "for this reason",
                        "since", "given that", "inasmuch as", "seeing that", "seeing as",
                        "in view of", "in light of", "due to", "owing to", "stemming from", "insofar as"],
            "CausNeg": ["nevertheless", "however", "despite", "in spite of",
                        "notwithstanding", "still", "yet", "albeit",
                        "even though", "although", "whereas", "while",
                        "in contrast", "on the other hand", "conversely",
                        "be that as it may", "nonetheless", "regardless",
                        "irrespective of", "granted that"],
            "TempPos": ["then", "subsequently", "afterwards", "later", "next",
                        "thereafter", "following this", "after that", "meanwhile",
                        "subsequently", "instantly", "immediately", "directly",
                        "thereupon", "forthwith", "straightaway", "presently",
                        "quickly", "in due course", "momentarily"],
            "TempNeg": ["previously", "beforehand", "earlier", "prior",
                        "until", "before", "ahead of time",
                        "in advance", "beforehand", "previously", "heretofore",
                        "ere", "antecedently", "in anticipation", "preparatory",
                        "in preparation", "in readiness", "in prospect", "preemptively"],
            "LogPos": ["indeed", "actually", "in fact", "clearly",
                       "evidently", "obviously", "certainly", "surely",
                       "undoubtedly", "unquestionably", "without doubt",
                       "assuredly", "definitely", "absolutely", "positively",
                       "categorically", "decidedly", "indubitably", "unmistakably", "truly"],
            "LogNeg": ["conversely", "on the contrary", "inversely",
                       "contrariwise", "alternatively", "rather",
                       "by contrast", "in contrast", "on the other hand",
                       "oppositely", "counter to", "in opposition",
                       "the other way around", "in a different light",
                       "from another perspective", "viewed differently",
                       "seen from another angle", "to the contrary",
                       "in another sense", "constructively"]
        }

        # Italian connectives, same categories as the English lexicon.
        self.conn_map_it = {
            "AdPos": ["inoltre", "in aggiunta", "ugualmente", "analogamente", "allo stesso modo",
                      "parimenti", "similmente", "per di più", "oltretutto", "altresì",
                      "anche", "pure", "persino", "perfino", "nonché",
                      "oltre a ciò", "a questo si aggiunge", "non solo", "insieme a",
                      "in più", "tanto quanto", "al pari di", "così come", "d'altronde"],
            "AdNeg": ["invece", "al contrario", "per contro", "d'altro canto", "viceversa",
                      "tuttavia", "però", "eppure", "nondimeno", "ciononostante",
                      "nonostante ciò", "al tempo stesso", "in contrasto", "a differenza di",
                      "diversamente", "contrariamente", "piuttosto", "in realtà", "anzi",
                      "bensì", "ma", "senonché", "fatta eccezione per", "a meno che"],
            "CausPos": ["quindi", "pertanto", "di conseguenza", "dunque", "perciò",
                        "per questo motivo", "per tale ragione", "così", "ecco perché",
                        "ne consegue che", "da ciò deriva", "in virtù di ciò", "a seguito di",
                        "per effetto di", "grazie a", "poiché", "dato che", "siccome",
                        "visto che", "dal momento che"],
            "CausNeg": ["nonostante", "sebbene", "benché", "malgrado", "pur",
                        "anche se", "quantunque", "per quanto", "a dispetto di",
                        "in barba a", "con tutto che", "pur tuttavia", "nel caso in cui",
                        "fermo restando che", "ciononostante", "neppure così", "eppure",
                        "tuttavia", "nondimeno", "ugualmente"],
            "TempPos": ["poi", "successivamente", "in seguito", "dopo", "dopodiché",
                        "in seguito a", "più tardi", "nel frattempo", "subito dopo",
                        "immediatamente", "a breve", "poco dopo", "contemporaneamente",
                        "nel contempo", "alla fine", "infine", "per ultimo", "da quel momento",
                        "a partire da allora", "in quello stesso istante"],
            "TempNeg": ["prima", "precedentemente", "in anticipo", "in precedenza", "già",
                        "tempo prima", "in passato", "fino a quel momento", "sino ad allora",
                        "antecedentemente", "prima ancora", "fin dall'inizio", "ab initio",
                        "precocemente", "fin da subito", "in anticipo su", "prima di",
                        "anteriormente", "per l'appunto prima", "al momento precedente"],
            "LogPos": ["infatti", "in effetti", "certamente", "ovviamente", "naturalmente",
                       "di certo", "senza dubbio", "evidentemente", "chiaramente",
                       "indubbiamente", "innegabilmente", "senz'altro", "con certezza",
                       "in verità", "invero", "difatti", "com'è noto", "come si sa",
                       "come è ovvio", "è indubbio che"],
            "LogNeg": ["al contrario", "per contro", "inversamente", "d'altra parte",
                       "in senso opposto", "in modo inverso", "ribaltando il ragionamento",
                       "al posto di", "piuttosto che", "contrariamente a quanto detto",
                       "in modo contrario", "a rovescio", "viceversa", "nel senso opposto",
                       "cambiando prospettiva", "vista da un altro angolo",
                       "in tutt'altro senso", "in senso contrario", "anziché", "in alternativa"]
        }

    def _get_jaccard(self, sets):
        """Return the Jaccard index (|intersection| / |union|) of a list of N sets.

        Returns 0.0 when the list itself is empty or when any of the sets is
        empty, so the caller never has to special-case degenerate sentences.
        """
        # The explicit emptiness check matters: ``all([])`` is True, and
        # ``set.intersection()`` without arguments raises TypeError.
        if not sets or not all(sets):
            return 0.0
        shared = set.intersection(*sets)
        combined = set.union(*sets)
        # Every set is non-empty here, so the union cannot be empty.
        return len(shared) / len(combined)

    @staticmethod
    def _split_paragraphs(sentences):
        """Group sentences into paragraphs; a sentence containing a newline closes one."""
        paragraphs, current = [], []
        for sent in sentences:
            current.append(sent)
            if "\n" in sent.text_with_ws:
                paragraphs.append(current)
                current = []
        if current:
            # Trailing sentences without a closing newline form the last paragraph.
            paragraphs.append(current)
        return paragraphs

    @staticmethod
    def _adjacent_similarities(embeddings):
        """Return the cosine similarity of every pair of consecutive embeddings."""
        return [
            float(cosine_similarity(left.reshape(1, -1), right.reshape(1, -1))[0][0])
            for left, right in zip(embeddings[:-1], embeddings[1:])
        ]

    @staticmethod
    def _count_connectors(text_lower, conn_map):
        """Count whole-word occurrences of each category's connectives in the text."""
        counts = {}
        for category, phrases in conn_map.items():
            # Some lexicon lists repeat a phrase; de-duplicate (order kept) so
            # one occurrence in the text is not counted twice in a category.
            unique_phrases = dict.fromkeys(phrase.lower() for phrase in phrases)
            counts[category] = sum(
                len(re.findall(rf"\b{re.escape(phrase)}\b", text_lower))
                for phrase in unique_phrases
            )
        return counts

    def analyze(self, doc, sentences, tokens_text, token_text_SP, lang="en"):
        """Compute the cohesion metrics of one document.

        Args:
            doc: parsed document; only ``doc.text`` is read.
            sentences: sequence of sentence objects exposing ``text``,
                ``text_with_ws`` and iteration over tokens that have
                ``lemma_``, ``pos_`` and ``is_punct`` (presumably spaCy
                spans -- confirm against the caller).
            tokens_text: not used by this method; kept for interface
                compatibility.
            token_text_SP: token list whose length normalises the connective
                counts to a per-1000-tokens rate.
            lang: ``"it"`` selects the Italian lexicon, anything else the
                English one.

        Returns:
            A dict with the per-category connective rates (``connectors``),
            ``general_cohesion``, the mean lexical overlap of sentence pairs
            (``lexical_cohesion_local``) and sentence triples
            (``lexical_cohesion_global``), the mean embedding similarity of
            adjacent sentences and adjacent paragraphs, and basic ``stats``.
            Every metric falls back to 0.0 when it cannot be computed.
        """
        # Kept as an instance attribute because the original code exposed it.
        self.conn_map = self.conn_map_it if lang == "it" else self.conn_map_en

        paragraphs = self._split_paragraphs(sentences)

        lex_scores_2, lex_scores_3 = [], []
        sem_scores_sent = []
        para_embeddings = []

        for paragraph in paragraphs:
            texts = [s.text for s in paragraph]

            # Sentence-level metrics need at least two sentences to compare.
            if len(paragraph) >= 2:
                sent_lemmas = [
                    {
                        t.lemma_.lower()
                        for t in s
                        if t.pos_ in self._CONTENT_POS and not t.is_punct
                    }
                    for s in paragraph
                ]
                # Sliding windows of two and three consecutive sentences.
                for i in range(len(sent_lemmas) - 1):
                    lex_scores_2.append(self._get_jaccard(sent_lemmas[i:i + 2]))
                for i in range(len(sent_lemmas) - 2):
                    lex_scores_3.append(self._get_jaccard(sent_lemmas[i:i + 3]))

                sent_embs = self.model.encode(texts)
                sem_scores_sent.extend(self._adjacent_similarities(sent_embs))

            # For a single-sentence paragraph the join is just that sentence.
            para_embeddings.append(self.model.encode(" ".join(texts)))

        sem_scores_para = self._adjacent_similarities(para_embeddings)

        conns = self._count_connectors(doc.text.lower(), self.conn_map)

        # Weighted total of connectives; the weights are those of the
        # original formula.
        wsum = (
            (conns.get("AdPos", 0) + conns.get("AdNeg", 0) * 1.5) +
            (conns.get("CausPos", 0) + conns.get("CausNeg", 0) * 2) +
            (conns.get("TempPos", 0) + conns.get("TempNeg", 0)) +
            (conns.get("LogPos", 0) + conns.get("LogNeg", 0) * 1.5)
        )
        counts = list(conns.values())
        stand_dev = float(np.std(counts)) if counts else 0.0

        # Logarithm of the spread in base ``wsum``; undefined for a base of 1,
        # a non-positive base or a non-positive argument, hence the guards.
        if wsum > 0 and wsum != 1 and stand_dev > 0:
            general_value = math.log(stand_dev, wsum)
        else:
            general_value = 0.0

        # Guard against an empty token list instead of dividing by zero.
        n_tokens = len(token_text_SP)
        connector_rates = {
            category: round((count / n_tokens) * 1000, 2) if n_tokens else 0.0
            for category, count in conns.items()
        }

        def _mean(values):
            # Plain float so every metric has the same type as the fallback.
            return float(np.mean(values)) if values else 0.0

        return {
            "connectors": connector_rates,
            "general_cohesion": round(general_value, 4),
            "lexical_cohesion_local": _mean(lex_scores_2),
            "lexical_cohesion_global": _mean(lex_scores_3),
            "semantic_cohesion_sentences": _mean(sem_scores_sent),
            "semantic_cohesion_paragraphs": _mean(sem_scores_para),
            "stats": {
                "num_paragraphs": len(paragraphs),
                "num_sentences": len(sentences)
            }
        }
|
|