File size: 7,337 Bytes
505b73c 1031fc3 505b73c 1031fc3 505b73c 40118dc 505b73c 40118dc 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c ea86ed7 d936547 9242860 d936547 dfbfab7 d936547 9242860 dfbfab7 9242860 d936547 9242860 d936547 dfbfab7 c7f292c 71cfc76 a3659cc 9242860 505b73c a3659cc 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 1031fc3 505b73c 9409d5d 4023977 1031fc3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | import pandas as pd
import en_core_web_sm
import it_core_news_sm
from modules.combined import calcola_clustering_e_score, elabora_e_correla
from modules.lexical import LexicalModule
from modules.syntax import SyntaxModule
from modules.cohesion import CohesionModule
from modules.figurative import FigurativeModule
from modules.consecutio import ConsecutioAnalyzer
from modules.frequencies import FrequenciesModule
class AnalyzerEngine:
    """Parse a text and run the lexical, syntax, cohesion and consecutio modules.

    Two parsers are loaded at construction time (``en_core_web_sm`` and
    ``it_core_news_sm``) together with one ``LexicalModule`` per language.
    The remaining modules are shared between languages and receive the
    language code at analysis time.
    """

    # Names of the metrics that prepare_filtered_data() copies into its
    # output table; every other metric is dropped.  (The original local
    # variable was called "esclusioni", but the set acts as a whitelist.)
    _REPORTED_METRICS = frozenset({
        "tense_stability", "avg_depth", "consecution_index", "verb_density",
        "semantic_cohesion_sentences", "pron_noun_ratio", "definite_articles_ratio",
        "demonstratives_ratio", "hapax_ratio", "mds_s", "mds_w", "indice_d_testo",
        "r1", "r2", "r3", "concreteness", "deictic_Frequency", "attr_adjs_freq",
        "emphatic_particles", "rel_clauses_per_sent", "present_ratio", "participle_ratio", "past_ratio",
        "first_person_ratio", "mod_per_noun", "avg_sent_len", "Conn_AdNeg", "Conn_AdPos",
        "Conn_CausNeg", "Conn_CausPos", "Conn_LogNeg", "Conn_LogPos", "Conn_TempNeg", "Conn_TempPos",
    })

    # Column layout of the table built by prepare_filtered_data().
    _FILTERED_COLUMNS = ("Categoria", "Metrica", "Valore")

    def __init__(self):
        """Load both parsers, the per-language word lists and all modules."""
        # Load the two parsers.
        self.nlp = en_core_web_sm.load()
        self.nlp_it = it_core_news_sm.load()

        # --- Lexical modules: one per language ---
        # 1. English resources.
        en_r1 = self._load_set("src/basewrd1.txt")
        en_r2 = self._load_set("src/basewrd2.txt")
        en_r3 = self._load_set("src/basewrd3.txt")
        en_mrc = self._load_mrc("src/MRCPLDatabase.txt")
        self.lex_mod_en = LexicalModule(en_r1, en_r2, en_r3, en_mrc)

        # 2. Italian resources (adjust the file names to the files actually
        #    shipped with the project).
        it_r1 = self._load_set("src/it_basewrd1.txt")
        it_r2 = self._load_set("src/it_basewrd2.txt")
        it_r3 = self._load_set("src/it_basewrd3.txt")
        it_mrc = self._load_mrc("src/ConcreteRating.txt")
        self.lex_mod_it = LexicalModule(it_r1, it_r2, it_r3, it_mrc)

        # --- Other modules: language-independent at construction time ---
        self.syn_mod = SyntaxModule()
        self.coh_mod = CohesionModule()
        self.fig_mod = FigurativeModule()
        self.consecutio = ConsecutioAnalyzer()
        self.freq_mod = FrequenciesModule()

        # Parsed document of the most recent run() call.
        self.last_doc = None

    def prepare_filtered_data(self, res, fig_res=None):
        """Flatten the whitelisted metrics of an analysis result into a table.

        Args:
            res: Mapping of result sections ("basic", "lexical", "syntax",
                "cohesion", "consecutio", "combined") to metric dicts.
                Missing sections are treated as empty.
            fig_res: Optional mapping holding the "mds_s" / "mds_w" values;
                a whitelisted key that is absent is reported as 0.  Ignored
                when falsy.

        Returns:
            A DataFrame with the columns "Categoria", "Metrica" and "Valore",
            one row per selected metric.  The columns are present even when
            no metric was selected, so callers can always index on them.
        """
        wanted = self._REPORTED_METRICS
        rows = []

        sections = (
            ("Statistiche Base", "basic"),
            ("Lessico", "lexical"),
            ("Sintassi e Verbi", "syntax"),
            ("Coesione", "cohesion"),
            ("Consecutio", "consecutio"),
            ("Combined", "combined"),
        )
        for cat_name, section_key in sections:
            for key, value in res.get(section_key, {}).items():
                # Only scalar values are tabulated; nested containers such
                # as the connectors dict are handled separately below.
                if key in wanted and isinstance(value, (int, float, str)):
                    rows.append({"Categoria": cat_name, "Metrica": key, "Valore": value})

        # Connector counts live in a nested dict and are reported with a
        # "Conn_" prefix.
        connectors = res.get("cohesion", {}).get("connectors", {})
        for conn_type, val in connectors.items():
            metrica_conn = f"Conn_{conn_type}"
            if metrica_conn in wanted:
                rows.append({"Categoria": "Connettori", "Metrica": metrica_conn, "Valore": val})

        # "total" is checked too, but it is not whitelisted at the moment,
        # so it is never emitted.
        if fig_res:
            for metric in ("mds_s", "mds_w", "total"):
                if metric in wanted:
                    rows.append({
                        "Categoria": "Figuratività (BERT)",
                        "Metrica": metric,
                        "Valore": fig_res.get(metric, 0),
                    })

        # Explicit columns keep the layout stable when `rows` is empty;
        # otherwise the set_index() call in run_combined_analysis() would
        # fail with a KeyError.
        return pd.DataFrame(rows, columns=list(self._FILTERED_COLUMNS))

    def run_combined_analysis(self, res, fig_res, source_name):
        """Feed the filtered metrics to the combined scoring helpers.

        The filtered table is indexed by ("Categoria", "Metrica"), its single
        value column is renamed to ``source_name``, and the result is passed
        through ``elabora_e_correla`` and ``calcola_clustering_e_score``.

        Returns:
            The ``(q_score, figura, classe_assegnata)`` tuple produced by
            ``calcola_clustering_e_score``.
        """
        df_filtrato = self.prepare_filtered_data(res, fig_res)
        df_input = df_filtrato.set_index(["Categoria", "Metrica"])
        df_input.columns = [source_name]

        q_score, matrice_corr, medie_classi, gs_perc, df_perc_nuovo = elabora_e_correla(
            df_input, source_name
        )
        q_score, figura, classe_assegnata = calcola_clustering_e_score(
            q_score, medie_classi, matrice_corr, source_name, gs_perc, df_perc_nuovo
        )
        return q_score, figura, classe_assegnata

    def _load_set(self, p):
        """Read a word list and return the lower-cased first token of each line.

        Blank lines are skipped.  Anything after the first whitespace
        (space or tab) on a line, e.g. a trailing number, is ignored.
        Undecodable bytes are replaced instead of raising.

        Best effort: if the file cannot be opened or read, the error is
        printed and an empty set is returned.
        """
        words = set()
        try:
            with open(p, 'r', encoding='utf-8', errors='replace') as f:
                for line in f:
                    # split() with no argument splits on any whitespace run,
                    # tabs included, so the word is never glued to the
                    # number that follows it.
                    parts = line.split()
                    if parts:
                        words.add(parts[0].lower())
        except OSError as e:
            print(f"Errore caricamento {p}: {e}")
            return set()
        return words

    def _load_mrc(self, p):
        """Read a tab-separated ``word<TAB>score`` file into a dict.

        Lines without a tab are skipped.  The word is lower-cased and the
        second field is converted with ``int``; later duplicates overwrite
        earlier ones.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If a second field is not an integer literal.
        """
        scores = {}
        with open(p, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.split('\t')
                if len(fields) > 1:
                    scores[fields[0].lower()] = int(fields[1])
        return scores

    def run(self, text, lang="en"):
        """Parse ``text`` and run every analysis module on it.

        Args:
            text: Raw input text.
            lang: "it" selects the Italian parser and lexical module; any
                other value selects the English ones.

        Returns:
            A dict with the parsed "doc", a "basic" section of counts
            (plus sentence texts and per-sentence dependency tables) and
            the "lexical", "syntax", "cohesion" and "consecutio" results.
            The parsed document is also stored in ``self.last_doc``.
        """
        # 1. Apply the parser and pick the matching lexical module.
        if lang == "it":
            doc = self.nlp_it(text)
            active_lex_mod = self.lex_mod_it
        else:
            doc = self.nlp(text)
            active_lex_mod = self.lex_mod_en
        self.last_doc = doc

        sentences = list(doc.sents)
        lemmas = [t.lemma_.lower() for t in doc if not t.is_punct]
        tokens_text = [t.text.lower() for t in doc if not t.is_punct]
        token_text_SP = [t.text.lower() for t in doc]  # punctuation included
        # One paragraph per line; str.split() already yields [text] when
        # there is no newline.
        paragraphs = text.split("\n")

        # Dependency table per sentence (CoNLL-style, tab-separated), used
        # by Tab 3.
        conll = []
        for sent in sentences:
            lines = []
            for i, t in enumerate(sent):
                if t.dep_ == "ROOT" or t.head == t:
                    head_idx = 0
                else:
                    # Convert the document-level head index into a 1-based
                    # index within the sentence.
                    head_idx = t.head.i - sent.start + 1
                lines.append(
                    f"{i+1}\t{t.text}\t{t.lemma_}\t{t.pos_}\t{t.tag_}\t_\t{head_idx}\t{t.dep_}"
                )
            conll.append("\n".join(lines))

        return {
            "doc": doc,
            "basic": {
                "tokens": len(tokens_text),
                "tokens_including_punct": len(token_text_SP),
                "sentences": len(sentences),
                "paragraphs": len(paragraphs),
                "chars": len(text),
                "texts": [s.text for s in sentences],
                "conll": conll,
            },
            # The lexical module matching the requested language is used.
            "lexical": active_lex_mod.analyze(doc, lemmas, tokens_text, token_text_SP, text, sentences, len(paragraphs), lang=lang),
            "syntax": self.syn_mod.analyze(doc, sentences, lang=lang),
            "cohesion": self.coh_mod.analyze(doc, sentences, tokens_text, token_text_SP, lang=lang),
            "consecutio": self.consecutio.analyze(doc),
        }