Spaces:

amaisto
/

CO3

Sleeping

File size: 7,337 Bytes

import pandas as pd
import en_core_web_sm
import it_core_news_sm
from modules.combined import calcola_clustering_e_score, elabora_e_correla
from modules.lexical import LexicalModule
from modules.syntax import SyntaxModule
from modules.cohesion import CohesionModule
from modules.figurative import FigurativeModule
from modules.consecutio import ConsecutioAnalyzer
from modules.frequencies import FrequenciesModule

class AnalyzerEngine:
    def __init__(self):
        """Load both language parsers, the lexical resources and the analysis modules.

        Everything is loaded eagerly: the `en_core_web_sm` and
        `it_core_news_sm` pipelines, one `LexicalModule` per language (each
        fed three word lists and one rating table read from `src/`), and the
        remaining modules, which take no constructor arguments here.
        """
        # One parser per supported language.
        self.nlp = en_core_web_sm.load()
        self.nlp_it = it_core_news_sm.load()

        # --- LEXICAL MODULES (one per language) ---

        # 1. English resources: three word lists, then the rating table.
        english_lists = [
            self._load_set(path)
            for path in ("src/basewrd1.txt", "src/basewrd2.txt", "src/basewrd3.txt")
        ]
        english_ratings = self._load_mrc("src/MRCPLDatabase.txt")
        self.lex_mod_en = LexicalModule(*english_lists, english_ratings)

        # 2. Italian resources (adjust the file names to match your actual files).
        italian_lists = [
            self._load_set(path)
            for path in ("src/it_basewrd1.txt", "src/it_basewrd2.txt", "src/it_basewrd3.txt")
        ]
        italian_ratings = self._load_mrc("src/ConcreteRating.txt")
        self.lex_mod_it = LexicalModule(*italian_lists, italian_ratings)

        # --- OTHER MODULES (no language-specific setup at init time) ---
        self.syn_mod = SyntaxModule()
        self.coh_mod = CohesionModule()
        self.fig_mod = FigurativeModule()
        self.consecutio = ConsecutioAnalyzer()
        self.freq_mod = FrequenciesModule()

        # Most recently parsed document; set by run().
        self.last_doc = None
    
    def prepare_filtered_data(self, res, fig_res=None):
        rows = []
        
        esclusioni = {        
            "tense_stability", "avg_depth", "consecution_index","verb_density",
            "semantic_cohesion_sentences", "pron_noun_ratio", "definite_articles_ratio",
            "demonstratives_ratio", "hapax_ratio", "mds_s", "mds_w", "indice_d_testo",
            "r1", "r2", "r3", "concreteness", "deictic_Frequency", "attr_adjs_freq",
            "emphatic_particles", "rel_clauses_per_sent", "present_ratio", "participle_ratio", "past_ratio", 
            "first_person_ratio", "mod_per_noun", "avg_sent_len", "Conn_AdNeg", "Conn_AdPos",
            "Conn_CausNeg", "Conn_CausPos", "Conn_LogNeg", "Conn_LogPos", "Conn_TempNeg", "Conn_TempPos"
        }
        
        categories = {
            "Statistiche Base": res.get("basic", {}),
            "Lessico": res.get("lexical", {}),
            "Sintassi e Verbi": res.get("syntax", {}),
            "Coesione": res.get("cohesion", {}),
            "Consecutio": res.get("consecutio", {}),
            "Combined": res.get("combined", {})
        }
        
        for cat_name, dict_data in categories.items():
            for key, value in dict_data.items():
                if isinstance(value, (int, float, str)) and key not in ["texts", "conll", "doc"]:
                    if key in esclusioni:
                        rows.append({"Categoria": cat_name, "Metrica": key, "Valore": value})
        
        connectors = res.get("cohesion", {}).get("connectors", {})
        for conn_type, val in connectors.items():
            metrica_conn = f"Conn_{conn_type}"
            if metrica_conn in esclusioni:
                rows.append({"Categoria": "Connettori", "Metrica": metrica_conn, "Valore": val})

        bert_metrics = ["mds_s", "mds_w", "total"]
        if fig_res: 
            for m in bert_metrics:
                if m in esclusioni:
                    val = fig_res.get(m, 0)
                    rows.append({"Categoria": "Figuratività (BERT)", "Metrica": m, "Valore": val})
            
        return pd.DataFrame(rows)
    
    def run_combined_analysis(self, res, fig_res, source_name):
        """Run the combined correlation and clustering step on one analysis result.

        The filtered metrics table is indexed by ("Categoria", "Metrica"), its
        single value column is renamed to `source_name`, and the frame is
        passed through `elabora_e_correla` and then
        `calcola_clustering_e_score`.

        Args:
            res: Analysis result forwarded to `prepare_filtered_data`.
            fig_res: Figurative metrics forwarded to `prepare_filtered_data`.
            source_name: Label used as the value column name and passed on to
                both combined-analysis functions.

        Returns:
            The 3-tuple produced by `calcola_clustering_e_score`, unpacked
            here as (q_score, figura, classe_assegnata).
        """
        metrics = self.prepare_filtered_data(res, fig_res).set_index(["Categoria", "Metrica"])
        metrics.columns = [source_name]

        correlation_out = elabora_e_correla(metrics, source_name)
        q_score, matrice_corr, medie_classi, gs_perc, df_perc_nuovo = correlation_out

        clustering_out = calcola_clustering_e_score(
            q_score, medie_classi, matrice_corr, source_name, gs_perc, df_perc_nuovo
        )
        q_score, figura, classe_assegnata = clustering_out

        return q_score, figura, classe_assegnata

    def _load_set(self, p):
        try:
            with open(p, 'r', encoding='utf-8', errors='replace') as f:
                words = set()
                for line in f:
                    # Rimuove spazi bianchi all'inizio/fine
                    clean_line = line.strip()
                    clean_line = clean_line.replace('\t', '')
                    if clean_line:
                        # Prende la prima parte (la parola) ed esclude il numero
                        word = clean_line.split()[0].lower()
                        if word:
                            words.add(word)
                return words
        except Exception as e:
            print(f"Errore caricamento {p}: {e}")
            return set()

    def _load_mrc(self, p):
        d = {}
        with open(p, 'r', encoding='utf-8') as f:
            for l in f:
                pts = l.split('\t')
                if len(pts) > 1: d[pts[0].lower()] = int(pts[1])
        return d

    def run(self, text, lang="en"):
        """Parse `text` and collect the results of every analysis module.

        Args:
            text: Raw text to analyse.
            lang: "it" selects the Italian parser and lexical module; any
                other value falls back to the English ones.

        Returns:
            A dict with the parsed "doc", a "basic" dict (token, sentence,
            paragraph and character counts, the sentence texts, and one
            tab-separated dependency listing per sentence under "conll"),
            and the outputs of the lexical, syntax, cohesion and consecutio
            modules. The parsed document is also stored in `self.last_doc`.
        """
        # 1. Pick the parser and the lexical module for the requested language.
        if lang != "it":
            parser, active_lex_mod = self.nlp, self.lex_mod_en
        else:
            parser, active_lex_mod = self.nlp_it, self.lex_mod_it
        doc = parser(text)
        self.last_doc = doc

        sentences = list(doc.sents)

        # token_text_SP keeps every token; lemmas and tokens_text drop punctuation.
        lemmas, tokens_text, token_text_SP = [], [], []
        for tok in doc:
            lowered = tok.text.lower()
            token_text_SP.append(lowered)
            if not tok.is_punct:
                lemmas.append(tok.lemma_.lower())
                tokens_text.append(lowered)

        # One paragraph per newline-separated chunk; str.split already yields
        # [text] when the text contains no newline.
        paragraphs = text.split("\n")

        # CoNLL-style listing for Tab 3: one string per sentence, one line per
        # token, with 1-based token positions and 0 as the head of the root.
        conll = []
        for sent in sentences:
            token_lines = []
            for position, tok in enumerate(sent, start=1):
                is_root = tok.dep_ == "ROOT" or tok.head == tok
                # Document-level head index converted to a 1-based
                # position inside the sentence.
                head_idx = 0 if is_root else tok.head.i - sent.start + 1
                token_lines.append(
                    f"{position}\t{tok.text}\t{tok.lemma_}\t{tok.pos_}\t{tok.tag_}\t_\t{head_idx}\t{tok.dep_}"
                )
            conll.append("\n".join(token_lines))

        paragraph_count = len(paragraphs)
        basic_stats = {
            "tokens": len(tokens_text),
            "tokens_including_punct": len(token_text_SP),
            "sentences": len(sentences),
            "paragraphs": paragraph_count,
            "chars": len(text),
            "texts": [s.text for s in sentences],
            "conll": conll
        }

        # The modules are invoked in the same order as the keys below; the
        # lexical module is the language-specific one chosen above.
        return {
            "doc": doc,
            "basic": basic_stats,
            "lexical": active_lex_mod.analyze(
                doc, lemmas, tokens_text, token_text_SP, text, sentences, paragraph_count, lang=lang
            ),
            "syntax": self.syn_mod.analyze(doc, sentences, lang=lang),
            "cohesion": self.coh_mod.analyze(doc, sentences, tokens_text, token_text_SP, lang=lang),
            "consecutio": self.consecutio.analyze(doc),
        }