import math
import re
from collections import Counter

import pandas as pd

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
except ImportError:
    # scikit-learn is only needed by FrequenciesModule.get_trends; keeping the
    # import optional lets the PMI and chunking helpers work without it.
    TfidfVectorizer = None

# Column layouts shared by populated and empty results, so callers can rely on
# the same schema whether or not any rows were produced.
_PMI_COLUMNS = ["N-gram", "PMI", "Freq"]
_FREQ_COLUMNS = ["Part", "Word", "Count"]
_TFIDF_COLUMNS = ["Part", "Word", "TF-IDF"]


class FrequenciesModule:
    """Frequency statistics over lemmas: n-gram PMI, chunking and per-chunk trends."""

    def get_pmi(self, lemmas, n=2, min_freq=2, top_n=15):
        """Rank the n-grams of ``lemmas`` by pointwise mutual information.

        Args:
            lemmas: Sequence of lemma strings (must support ``len`` and slicing).
            n: Size of the n-grams to build.
            min_freq: Minimum number of occurrences for an n-gram to be scored.
            top_n: Maximum number of rows to return.

        Returns:
            A DataFrame with columns ``N-gram``, ``PMI`` and ``Freq``, sorted by
            descending PMI and truncated to ``top_n`` rows. The frame is empty
            (but keeps the same columns) when no n-gram qualifies.
        """
        if n < 1 or len(lemmas) < n:
            return pd.DataFrame(columns=_PMI_COLUMNS)

        # Sliding window: zip the sequence against itself shifted by 0..n-1.
        ngrams = list(zip(*(lemmas[i:] for i in range(n))))
        ngram_counts = Counter(ngrams)
        word_counts = Counter(lemmas)
        total_words = len(lemmas)
        total_ngrams = len(ngrams)

        rows = []
        for ngram, freq in ngram_counts.items():
            if freq < min_freq:
                continue
            # Joint probability of the n-gram.
            p_joint = freq / total_ngrams
            # Product of the marginal probabilities of its words.
            p_independent = math.prod(
                word_counts[word] / total_words for word in ngram
            )
            # The product can underflow to 0.0 for long n-grams of rare words.
            pmi = math.log2(p_joint / p_independent) if p_independent > 0 else 0
            rows.append({"N-gram": " ".join(ngram), "PMI": pmi, "Freq": freq})

        if not rows:
            return pd.DataFrame(columns=_PMI_COLUMNS)
        return pd.DataFrame(rows).sort_values("PMI", ascending=False).head(top_n)

    def chunk_doc(self, text, doc, mode="numeric", val=5):
        """Split a document into chunks, each returned as a list of lowercase words.

        Args:
            text: Raw text; only used when ``mode == "regex"``.
            doc: Iterable of tokens exposing ``lemma_``, ``is_punct`` and
                ``is_space`` (spaCy-style); only used when ``mode == "numeric"``.
            mode: ``"numeric"`` splits the lemmas into roughly ``val`` parts of
                equal size; ``"regex"`` splits ``text`` on the pattern ``val``.
            val: Number of parts (numeric mode) or regular expression (regex mode).

        Returns:
            A list of word lists. Numeric mode falls back to a single chunk when
            ``val`` is not a positive integer or the document has no lemmas.
            Regex mode falls back to a single chunk when ``val`` is not a valid
            pattern. Any other ``mode`` yields an empty list.
        """
        if mode == "numeric":
            lemmas = [
                tok.lemma_.lower()
                for tok in doc
                if not tok.is_punct and not tok.is_space
            ]
            try:
                parts = int(val)
            except (TypeError, ValueError, OverflowError):
                return [lemmas]
            if parts <= 0 or not lemmas:
                return [lemmas]
            # With at least one lemma and parts > 0 the chunk size is >= 1.
            chunk_size = math.ceil(len(lemmas) / parts)
            return [
                lemmas[start:start + chunk_size]
                for start in range(0, len(lemmas), chunk_size)
            ]

        if mode == "regex":
            # The regex path works on surface tokens rather than lemmas: it is a
            # quick approximation that avoids running the full NLP pipeline again.
            try:
                pieces = re.split(str(val), text)
            except re.error:
                pieces = [text]
            # re.split yields None for capturing groups that did not take part
            # in a match, so guard before calling strip().
            return [
                re.findall(r'\b\w+\b', piece.lower())
                for piece in pieces
                if piece and piece.strip()
            ]

        return []

    def get_trends(self, chunks_lemmas):
        """Compute per-chunk absolute frequencies and TF-IDF scores.

        Args:
            chunks_lemmas: Iterable of chunks, each an iterable of word strings
                (for example the output of ``chunk_doc``).

        Returns:
            A ``(df_freq, df_tfidf)`` tuple. ``df_freq`` has columns ``Part``,
            ``Word`` and ``Count``; ``df_tfidf`` is in long format with columns
            ``Part``, ``Word`` and ``TF-IDF``. Parts are numbered from 1. Either
            frame may be empty but always carries its columns.

        Raises:
            ImportError: If scikit-learn is not installed and there is at least
                one chunk to vectorize.
        """
        chunks = [list(chunk) for chunk in chunks_lemmas]
        if not chunks:
            return (
                pd.DataFrame(columns=_FREQ_COLUMNS),
                pd.DataFrame(columns=_TFIDF_COLUMNS),
            )

        # Absolute frequencies.
        freq_rows = [
            {"Part": part, "Word": word, "Count": count}
            for part, chunk in enumerate(chunks, start=1)
            for word, count in Counter(chunk).items()
        ]
        df_freq = pd.DataFrame(freq_rows, columns=_FREQ_COLUMNS)

        # TF-IDF.
        if TfidfVectorizer is None:
            raise ImportError(
                "scikit-learn is required for FrequenciesModule.get_trends"
            )
        docs = [" ".join(chunk) for chunk in chunks]
        # NOTE(review): the vectorizer re-tokenizes the joined text with its own
        # default settings, so its vocabulary may not match the words counted in
        # df_freq exactly - confirm whether that is intended.
        vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = vectorizer.fit_transform(docs)
        except ValueError:
            # Raised when the vocabulary is empty (no usable tokens in any chunk).
            return df_freq, pd.DataFrame(columns=_TFIDF_COLUMNS)

        df_tfidf = pd.DataFrame(
            tfidf_matrix.toarray(),
            columns=vectorizer.get_feature_names_out(),
        )
        df_tfidf["Part"] = range(1, len(docs) + 1)
        # Melt to long format so the result can be fed to Plotly directly.
        df_tfidf = df_tfidf.melt(
            id_vars=["Part"], var_name="Word", value_name="TF-IDF"
        )
        return df_freq, df_tfidf