File size: 7,337 Bytes
505b73c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1031fc3
505b73c
1031fc3
505b73c
40118dc
 
 
 
505b73c
 
 
40118dc
 
 
 
505b73c
1031fc3
505b73c
 
 
 
 
 
1031fc3
505b73c
1031fc3
505b73c
 
1031fc3
 
505b73c
 
 
 
1031fc3
505b73c
 
 
1031fc3
505b73c
 
 
 
 
 
 
 
1031fc3
505b73c
 
 
 
 
1031fc3
505b73c
 
 
 
 
 
 
1031fc3
505b73c
 
 
 
1031fc3
505b73c
1031fc3
505b73c
 
 
 
 
 
 
1031fc3
505b73c
 
ea86ed7
d936547
9242860
 
d936547
dfbfab7
d936547
9242860
 
dfbfab7
9242860
d936547
 
9242860
d936547
dfbfab7
c7f292c
71cfc76
 
 
a3659cc
 
 
 
9242860
505b73c
a3659cc
505b73c
 
 
 
 
 
 
 
 
 
 
 
 
 
1031fc3
505b73c
 
 
 
 
 
 
 
 
1031fc3
505b73c
 
1031fc3
505b73c
 
 
 
 
 
 
 
 
1031fc3
 
505b73c
 
9409d5d
 
 
 
4023977
1031fc3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import pandas as pd
import en_core_web_sm
import it_core_news_sm
from modules.combined import calcola_clustering_e_score, elabora_e_correla
from modules.lexical import LexicalModule
from modules.syntax import SyntaxModule
from modules.cohesion import CohesionModule
from modules.figurative import FigurativeModule
from modules.consecutio import ConsecutioAnalyzer
from modules.frequencies import FrequenciesModule

class AnalyzerEngine:
    """Run the English/Italian text-analysis pipeline and collect its metrics.

    The engine loads one parser per language (``en_core_web_sm`` and
    ``it_core_news_sm``), one ``LexicalModule`` per language, and a single
    shared instance of the syntax, cohesion, figurative, consecutio and
    frequency modules.
    """

    # Metrics that prepare_filtered_data keeps in its output table.
    # This is a whitelist: anything not listed here is dropped.
    _SELECTED_METRICS = frozenset({
        "tense_stability", "avg_depth", "consecution_index", "verb_density",
        "semantic_cohesion_sentences", "pron_noun_ratio", "definite_articles_ratio",
        "demonstratives_ratio", "hapax_ratio", "mds_s", "mds_w", "indice_d_testo",
        "r1", "r2", "r3", "concreteness", "deictic_Frequency", "attr_adjs_freq",
        "emphatic_particles", "rel_clauses_per_sent", "present_ratio", "participle_ratio", "past_ratio",
        "first_person_ratio", "mod_per_noun", "avg_sent_len", "Conn_AdNeg", "Conn_AdPos",
        "Conn_CausNeg", "Conn_CausPos", "Conn_LogNeg", "Conn_LogPos", "Conn_TempNeg", "Conn_TempPos",
    })

    # Keys of the result dictionaries that hold payloads rather than metrics.
    _NON_METRIC_KEYS = frozenset({"texts", "conll", "doc"})

    # Keys looked up in the figurative-module result, in output order.
    _BERT_METRICS = ("mds_s", "mds_w", "total")

    # Column layout of the table returned by prepare_filtered_data.
    _REPORT_COLUMNS = ["Categoria", "Metrica", "Valore"]

    def __init__(self):
        """Load both parsers, the per-language lexical resources and all modules."""
        # One parser per supported language.
        self.nlp = en_core_web_sm.load()
        self.nlp_it = it_core_news_sm.load()

        # --- Lexical modules (one per language) ---

        # 1. English resources
        en_r1 = self._load_set("src/basewrd1.txt")
        en_r2 = self._load_set("src/basewrd2.txt")
        en_r3 = self._load_set("src/basewrd3.txt")
        en_mrc = self._load_mrc("src/MRCPLDatabase.txt")
        self.lex_mod_en = LexicalModule(en_r1, en_r2, en_r3, en_mrc)

        # 2. Italian resources (adjust the file names to the ones actually used)
        it_r1 = self._load_set("src/it_basewrd1.txt")
        it_r2 = self._load_set("src/it_basewrd2.txt")
        it_r3 = self._load_set("src/it_basewrd3.txt")
        it_mrc = self._load_mrc("src/ConcreteRating.txt")
        self.lex_mod_it = LexicalModule(it_r1, it_r2, it_r3, it_mrc)

        # --- Remaining modules (language is chosen per call, not at init) ---
        self.syn_mod = SyntaxModule()
        self.coh_mod = CohesionModule()
        self.fig_mod = FigurativeModule()
        self.consecutio = ConsecutioAnalyzer()
        self.freq_mod = FrequenciesModule()

        # Most recent parsed document; set by run().
        self.last_doc = None

    def prepare_filtered_data(self, res, fig_res=None):
        """Flatten the selected metrics of an analysis result into a table.

        Args:
            res: Result dictionary shaped like the return value of ``run``
                (sub-dictionaries under "basic", "lexical", "syntax",
                "cohesion", "consecutio" and optionally "combined").
            fig_res: Optional dictionary of figurative metrics. When truthy,
                the whitelisted keys among "mds_s", "mds_w" and "total" are
                added, defaulting to 0 when absent.

        Returns:
            A ``pandas.DataFrame`` with the columns "Categoria", "Metrica"
            and "Valore". The columns are present even when no metric
            matched, so callers can index on them unconditionally.
        """
        selected = self._SELECTED_METRICS
        rows = []

        categories = {
            "Statistiche Base": res.get("basic", {}),
            "Lessico": res.get("lexical", {}),
            "Sintassi e Verbi": res.get("syntax", {}),
            "Coesione": res.get("cohesion", {}),
            "Consecutio": res.get("consecutio", {}),
            "Combined": res.get("combined", {})
        }

        for cat_name, metrics in categories.items():
            for key, value in metrics.items():
                if key in self._NON_METRIC_KEYS or key not in selected:
                    continue
                # Only scalar values are reported; nested structures are skipped.
                if isinstance(value, (int, float, str)):
                    rows.append({"Categoria": cat_name, "Metrica": key, "Valore": value})

        # Connector counts live one level deeper and are reported as Conn_<type>.
        connectors = res.get("cohesion", {}).get("connectors", {})
        for conn_type, count in connectors.items():
            metric_name = f"Conn_{conn_type}"
            if metric_name in selected:
                rows.append({"Categoria": "Connettori", "Metrica": metric_name, "Valore": count})

        if fig_res:
            for metric_name in self._BERT_METRICS:
                if metric_name in selected:
                    rows.append({
                        "Categoria": "Figuratività (BERT)",
                        "Metrica": metric_name,
                        "Valore": fig_res.get(metric_name, 0),
                    })

        # Explicit columns keep the schema stable for an empty result.
        return pd.DataFrame(rows, columns=self._REPORT_COLUMNS)

    def run_combined_analysis(self, res, fig_res, source_name):
        """Feed the filtered metrics of one text to the combined module.

        Args:
            res: Result dictionary as accepted by ``prepare_filtered_data``.
            fig_res: Figurative metrics dictionary (may be ``None``).
            source_name: Label used as the name of the single value column.

        Returns:
            The tuple ``(q_score, figura, classe_assegnata)`` produced by
            ``calcola_clustering_e_score``.
        """
        filtered = self.prepare_filtered_data(res, fig_res)
        # Index by (category, metric); the one remaining column holds the values.
        df_input = filtered.set_index(["Categoria", "Metrica"])
        df_input.columns = [source_name]

        q_score, matrice_corr, medie_classi, gs_perc, df_perc_nuovo = elabora_e_correla(df_input, source_name)
        q_score, figura, classe_assegnata = calcola_clustering_e_score(
            q_score, medie_classi, matrice_corr, source_name, gs_perc, df_perc_nuovo
        )

        return q_score, figura, classe_assegnata

    def _load_set(self, p):
        """Read a word list and return the lower-cased first token of each line.

        Lines are split on any whitespace, so both ``word 12`` and
        ``word<TAB>12`` yield ``word``; leading indentation and blank lines
        are ignored. Undecodable bytes are replaced rather than rejected.

        Args:
            p: Path of the word-list file.

        Returns:
            The set of words, or an empty set when the file cannot be read
            (the error is printed, not raised).
        """
        words = set()
        try:
            with open(p, 'r', encoding='utf-8', errors='replace') as f:
                for line in f:
                    # split() without arguments treats tabs and spaces alike,
                    # so the word is never glued to the number that follows it.
                    fields = line.split()
                    if fields:
                        words.add(fields[0].lower())
        except OSError as e:
            print(f"Errore caricamento {p}: {e}")
            return set()
        return words

    def _load_mrc(self, p):
        """Read a tab-separated ``word<TAB>value`` file into a dictionary.

        Lines without a tab are skipped. Later occurrences of a word
        overwrite earlier ones.

        Args:
            p: Path of the tab-separated file.

        Returns:
            Dictionary mapping the lower-cased first column to the second
            column converted with ``int``.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If a second column is not an integer literal.
        """
        ratings = {}
        with open(p, 'r', encoding='utf-8') as f:
            for row in f:
                columns = row.split('\t')
                if len(columns) > 1:
                    ratings[columns[0].lower()] = int(columns[1])
        return ratings

    @staticmethod
    def _sentence_to_conll(sent):
        """Render one parsed sentence as tab-separated, CoNLL-style lines.

        Each line holds: 1-based token index, form, lemma, coarse POS, fine
        tag, ``_``, 1-based head index within the sentence (0 for the root)
        and the dependency label.
        """
        lines = []
        for position, tok in enumerate(sent, start=1):
            is_root = tok.dep_ == "ROOT" or tok.head == tok
            # tok.head.i is a document-level index; shift it to the sentence.
            head_idx = 0 if is_root else tok.head.i - sent.start + 1
            lines.append(
                f"{position}\t{tok.text}\t{tok.lemma_}\t{tok.pos_}\t{tok.tag_}\t_\t{head_idx}\t{tok.dep_}"
            )
        return "\n".join(lines)

    def run(self, text, lang="en"):
        """Parse ``text`` and run the lexical, syntax, cohesion and consecutio modules.

        Args:
            text: Raw text to analyse.
            lang: ``"it"`` selects the Italian parser and lexical module;
                any other value selects the English ones.

        Returns:
            Dictionary with the keys "doc" (the parsed document), "basic"
            (counts, sentence texts and per-sentence CoNLL-style strings),
            "lexical", "syntax", "cohesion" and "consecutio" (each the
            return value of the corresponding module's ``analyze``).
        """
        # 1. Parse and pick the lexical module that matches the language.
        if lang == "it":
            doc = self.nlp_it(text)
            active_lex_mod = self.lex_mod_it
        else:
            doc = self.nlp(text)
            active_lex_mod = self.lex_mod_en

        self.last_doc = doc
        sentences = list(doc.sents)
        lemmas = [t.lemma_.lower() for t in doc if not t.is_punct]
        tokens_text = [t.text.lower() for t in doc if not t.is_punct]
        token_text_SP = [t.text.lower() for t in doc]
        # Every newline-delimited chunk counts as a paragraph, blank ones included.
        paragraph_count = len(text.split("\n"))

        # One CoNLL-style string per sentence.
        conll = [self._sentence_to_conll(sent) for sent in sentences]

        return {
            "doc": doc,
            "basic": {
                "tokens": len(tokens_text),
                "tokens_including_punct": len(token_text_SP),
                "sentences": len(sentences),
                "paragraphs": paragraph_count,
                "chars": len(text),
                "texts": [s.text for s in sentences],
                "conll": conll
            },
            # The lexical module is the one selected for the requested language.
            "lexical": active_lex_mod.analyze(doc, lemmas, tokens_text, token_text_SP, text, sentences, paragraph_count, lang=lang),
            "syntax": self.syn_mod.analyze(doc, sentences, lang=lang),
            "cohesion": self.coh_mod.analyze(doc, sentences, tokens_text, token_text_SP, lang=lang),
            "consecutio": self.consecutio.analyze(doc),
        }