# CO3 / src/engine.py
# amaisto's picture
# Update src/engine.py
# 40118dc verified
import pandas as pd
import en_core_web_sm
import it_core_news_sm
from modules.combined import calcola_clustering_e_score, elabora_e_correla
from modules.lexical import LexicalModule
from modules.syntax import SyntaxModule
from modules.cohesion import CohesionModule
from modules.figurative import FigurativeModule
from modules.consecutio import ConsecutioAnalyzer
from modules.frequencies import FrequenciesModule
class AnalyzerEngine:
    """Parse a text and collect the results of the analysis modules.

    The engine keeps one parser and one lexical module per supported
    language (English and Italian) and a single shared instance of every
    other analysis module. ``run`` produces the raw results dictionary,
    ``prepare_filtered_data`` flattens a selection of its metrics into a
    DataFrame, and ``run_combined_analysis`` feeds that DataFrame to the
    functions imported from ``modules.combined``.
    """

    # Allow-list of metric names that ``prepare_filtered_data`` keeps.
    # Anything not listed here is dropped from the filtered DataFrame.
    _SELECTED_METRICS = frozenset({
        "tense_stability", "avg_depth", "consecution_index", "verb_density",
        "semantic_cohesion_sentences", "pron_noun_ratio",
        "definite_articles_ratio", "demonstratives_ratio", "hapax_ratio",
        "mds_s", "mds_w", "indice_d_testo", "r1", "r2", "r3", "concreteness",
        "deictic_Frequency", "attr_adjs_freq", "emphatic_particles",
        "rel_clauses_per_sent", "present_ratio", "participle_ratio",
        "past_ratio", "first_person_ratio", "mod_per_noun", "avg_sent_len",
        "Conn_AdNeg", "Conn_AdPos", "Conn_CausNeg", "Conn_CausPos",
        "Conn_LogNeg", "Conn_LogPos", "Conn_TempNeg", "Conn_TempPos",
    })

    # Column layout of the filtered DataFrame; fixed so that an empty
    # selection still yields the columns ``run_combined_analysis`` indexes on.
    _FILTERED_COLUMNS = ("Categoria", "Metrica", "Valore")

    def __init__(self):
        """Load both parsers, the lexical resources and the analysis modules."""
        # Load the two parsers (English is the default, Italian is optional).
        self.nlp = en_core_web_sm.load()
        self.nlp_it = it_core_news_sm.load()

        # --- Lexical modules: one per language ---
        # 1. English resources
        en_r1 = self._load_set("src/basewrd1.txt")
        en_r2 = self._load_set("src/basewrd2.txt")
        en_r3 = self._load_set("src/basewrd3.txt")
        en_mrc = self._load_mrc("src/MRCPLDatabase.txt")
        self.lex_mod_en = LexicalModule(en_r1, en_r2, en_r3, en_mrc)

        # 2. Italian resources (adjust the file names to the actual ones)
        it_r1 = self._load_set("src/it_basewrd1.txt")
        it_r2 = self._load_set("src/it_basewrd2.txt")
        it_r3 = self._load_set("src/it_basewrd3.txt")
        it_mrc = self._load_mrc("src/ConcreteRating.txt")
        self.lex_mod_it = LexicalModule(it_r1, it_r2, it_r3, it_mrc)

        # --- Other modules: constructed without any language argument ---
        self.syn_mod = SyntaxModule()
        self.coh_mod = CohesionModule()
        self.fig_mod = FigurativeModule()
        self.consecutio = ConsecutioAnalyzer()
        self.freq_mod = FrequenciesModule()

        # Parsed document of the most recent ``run`` call.
        self.last_doc = None

    def prepare_filtered_data(self, res, fig_res=None):
        """Flatten the selected metrics of ``res`` into a DataFrame.

        Args:
            res: Results dictionary shaped like the return value of ``run``
                (sections ``basic``, ``lexical``, ``syntax``, ``cohesion``,
                ``consecutio`` and, optionally, ``combined``). Missing
                sections are treated as empty.
            fig_res: Optional mapping holding the figurative-language
                metrics ``mds_s`` / ``mds_w``; a missing metric counts as 0.

        Returns:
            A DataFrame with the columns ``Categoria``, ``Metrica`` and
            ``Valore``, one row per selected metric. The columns are present
            even when no metric was selected.
        """
        selected = self._SELECTED_METRICS
        rows = []

        categories = {
            "Statistiche Base": res.get("basic", {}),
            "Lessico": res.get("lexical", {}),
            "Sintassi e Verbi": res.get("syntax", {}),
            "Coesione": res.get("cohesion", {}),
            "Consecutio": res.get("consecutio", {}),
            "Combined": res.get("combined", {}),
        }
        for cat_name, section in categories.items():
            for key, value in section.items():
                # Only scalar values are tabulated; nested containers such as
                # the connector table are handled separately below.
                if key in selected and isinstance(value, (int, float, str)):
                    rows.append({"Categoria": cat_name, "Metrica": key, "Valore": value})

        # Connector counts live in a nested dict and get a "Conn_" prefix.
        connectors = res.get("cohesion", {}).get("connectors", {})
        for conn_type, val in connectors.items():
            metric_name = f"Conn_{conn_type}"
            if metric_name in selected:
                rows.append({"Categoria": "Connettori", "Metrica": metric_name, "Valore": val})

        if fig_res:
            # "total" is looked up as well but is currently not in the
            # allow-list, so only mds_s and mds_w produce rows.
            for metric_name in ("mds_s", "mds_w", "total"):
                if metric_name in selected:
                    rows.append({
                        "Categoria": "Figuratività (BERT)",
                        "Metrica": metric_name,
                        "Valore": fig_res.get(metric_name, 0),
                    })

        # Passing the columns explicitly keeps the frame well-formed when
        # ``rows`` is empty; otherwise set_index() downstream raises KeyError.
        return pd.DataFrame(rows, columns=list(self._FILTERED_COLUMNS))

    def run_combined_analysis(self, res, fig_res, source_name):
        """Run the combined scoring pipeline on one analysed text.

        The filtered metrics are indexed by (``Categoria``, ``Metrica``),
        their single value column is renamed to ``source_name`` and the frame
        is passed through ``elabora_e_correla`` and then
        ``calcola_clustering_e_score``.

        Args:
            res: Results dictionary as returned by ``run``.
            fig_res: Figurative-language metrics, or ``None``.
            source_name: Label used as the column name for this text.

        Returns:
            The ``(q_score, figura, classe_assegnata)`` tuple returned by
            ``calcola_clustering_e_score``.
        """
        df_filtrato = self.prepare_filtered_data(res, fig_res)
        df_input = df_filtrato.set_index(["Categoria", "Metrica"])
        df_input.columns = [source_name]

        q_score, matrice_corr, medie_classi, gs_perc, df_perc_nuovo = elabora_e_correla(
            df_input, source_name
        )
        q_score, figura, classe_assegnata = calcola_clustering_e_score(
            q_score, medie_classi, matrice_corr, source_name, gs_perc, df_perc_nuovo
        )
        return q_score, figura, classe_assegnata

    def _load_set(self, p):
        """Read a word list and return its lower-cased first fields as a set.

        Each non-blank line contributes its first whitespace-separated field;
        anything after it (for example a count) is ignored. If the file
        cannot be read, the error is printed and an empty set is returned.
        """
        words = set()
        try:
            with open(p, 'r', encoding='utf-8', errors='replace') as f:
                for line in f:
                    # split() without arguments treats spaces and tabs alike
                    # and ignores leading indentation, so "word\t12" yields
                    # "word" instead of being glued into "word12".
                    fields = line.split()
                    if fields:
                        words.add(fields[0].lower())
        except OSError as e:
            print(f"Errore caricamento {p}: {e}")
            return set()
        return words

    @staticmethod
    def _parse_number(raw):
        """Return ``raw`` as int, else as float, else ``None``."""
        try:
            return int(raw)
        except ValueError:
            pass
        try:
            return float(raw)
        except ValueError:
            return None

    def _load_mrc(self, p):
        """Read a tab-separated ``word<TAB>rating`` file into a dict.

        Keys are lower-cased words; values are ints, or floats when the
        rating is not an integer. Lines without a second column or with a
        non-numeric rating (such as a header row) are skipped. As in
        ``_load_set``, an unreadable file is reported on stdout and yields an
        empty result instead of aborting engine construction.
        """
        ratings = {}
        try:
            with open(p, 'r', encoding='utf-8', errors='replace') as f:
                for line in f:
                    fields = line.split('\t')
                    if len(fields) < 2:
                        continue
                    word = fields[0].strip().lower()
                    value = self._parse_number(fields[1])
                    if word and value is not None:
                        ratings[word] = value
        except OSError as e:
            print(f"Errore caricamento {p}: {e}")
            return {}
        return ratings

    @staticmethod
    def _build_conll(sentences):
        """Render each sentence as tab-separated, CoNLL-style token lines.

        Every token line holds eight fields: 1-based position, text, lemma,
        coarse POS, fine-grained tag, ``_``, head position and dependency
        label. The head position is relative to the sentence, 0 for the root.
        """
        rendered = []
        for sent in sentences:
            lines = []
            for position, t in enumerate(sent, start=1):
                if t.dep_ == "ROOT" or t.head == t:
                    head_idx = 0
                else:
                    # Token indices are document-wide; rebase on the sentence.
                    head_idx = t.head.i - sent.start + 1
                lines.append(
                    f"{position}\t{t.text}\t{t.lemma_}\t{t.pos_}\t{t.tag_}\t_\t{head_idx}\t{t.dep_}"
                )
            rendered.append("\n".join(lines))
        return rendered

    def run(self, text, lang="en"):
        """Parse ``text`` and run every analysis module on it.

        Args:
            text: Raw input text.
            lang: ``"it"`` selects the Italian parser and lexical module;
                any other value falls back to English.

        Returns:
            A dict with the parsed ``doc``, a ``basic`` section (counts,
            sentence texts and per-sentence CoNLL strings) and the outputs of
            the lexical, syntax, cohesion and consecutio modules.
        """
        # 1. Apply the parser and pick the matching lexical module.
        if lang == "it":
            doc = self.nlp_it(text)
            active_lex_mod = self.lex_mod_it
        else:
            doc = self.nlp(text)
            active_lex_mod = self.lex_mod_en
        self.last_doc = doc

        sentences = list(doc.sents)
        lemmas = [t.lemma_.lower() for t in doc if not t.is_punct]
        tokens_text = [t.text.lower() for t in doc if not t.is_punct]
        token_text_SP = [t.text.lower() for t in doc]
        # NOTE(review): every newline starts a new paragraph, so blank lines
        # are counted as paragraphs too - confirm this is intended.
        paragraphs = text.split("\n")

        # CoNLL-style output for Tab 3.
        conll = self._build_conll(sentences)

        return {
            "doc": doc,
            "basic": {
                "tokens": len(tokens_text),
                "tokens_including_punct": len(token_text_SP),
                "sentences": len(sentences),
                "paragraphs": len(paragraphs),
                "chars": len(text),
                "texts": [s.text for s in sentences],
                "conll": conll,
            },
            # The lexical module matching the language is used here.
            "lexical": active_lex_mod.analyze(
                doc, lemmas, tokens_text, token_text_SP, text, sentences,
                len(paragraphs), lang=lang,
            ),
            "syntax": self.syn_mod.analyze(doc, sentences, lang=lang),
            "cohesion": self.coh_mod.analyze(
                doc, sentences, tokens_text, token_text_SP, lang=lang
            ),
            "consecutio": self.consecutio.analyze(doc),
        }