# File: CO3/src/modules/frequencies.py
# Repository page header (not code): amaisto's picture; "Upload 7 files"; commit b47539a (verified).
import math
from collections import Counter
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
class FrequenciesModule:
    """Frequency statistics over lemma sequences.

    Provides n-gram PMI ranking, document chunking (by count or by regex)
    and per-chunk absolute-frequency / TF-IDF tables.
    """

    def get_pmi(self, lemmas, n=2, min_freq=2, top_n=15):
        """Rank n-grams by pointwise mutual information (PMI).

        Args:
            lemmas: Sliceable sequence of lemma strings in document order.
            n: Size of the n-grams to score.
            min_freq: Minimum n-gram count required for a row to be kept.
            top_n: Maximum number of rows returned.

        Returns:
            A DataFrame with columns "N-gram", "PMI" and "Freq", sorted by
            PMI descending (ties broken by Freq descending) and truncated to
            ``top_n`` rows. When no n-gram qualifies, an empty DataFrame with
            the same three columns is returned.
        """
        columns = ["N-gram", "PMI", "Freq"]
        if n < 1 or len(lemmas) < n:
            return pd.DataFrame(columns=columns)

        # Sliding window of width n built from n shifted views of the input.
        ngram_counts = Counter(zip(*(lemmas[i:] for i in range(n))))
        word_counts = Counter(lemmas)
        tot_words = len(lemmas)
        tot_ngrams = sum(ngram_counts.values())

        rows = []
        for ngram, freq in ngram_counts.items():
            if freq < min_freq:
                continue
            # Joint probability of the n-gram.
            p_xy = freq / tot_ngrams
            # Product of the marginal probabilities of its words.
            p_x_p_y = math.prod(word_counts[w] / tot_words for w in ngram)
            # The guard only matters if the product underflows to zero.
            pmi = math.log2(p_xy / p_x_p_y) if p_x_p_y > 0 else 0
            rows.append({"N-gram": " ".join(ngram), "PMI": pmi, "Freq": freq})

        if not rows:
            return pd.DataFrame(columns=columns)

        df = pd.DataFrame(rows, columns=columns)
        # Secondary key makes the order of equal-PMI rows deterministic.
        return df.sort_values(["PMI", "Freq"], ascending=False).head(top_n)

    def chunk_doc(self, text, doc, mode="numeric", val=5):
        """Split a document into chunks of lowercase word lists.

        Args:
            text: Raw text; only read when ``mode == "regex"``.
            doc: Iterable of tokens exposing ``lemma_``, ``is_punct`` and
                ``is_space``; only read when ``mode == "numeric"``.
            mode: ``"numeric"`` splits the lemma list into roughly ``val``
                parts; ``"regex"`` splits ``text`` on the pattern ``val``.
            val: Number of parts (numeric mode) or regex pattern (regex mode).

        Returns:
            A list of chunks, each a list of strings. Numeric mode yields
            lemmas; regex mode yields plain ``\\w+`` tokens, not lemmas.
            An unknown ``mode`` yields an empty list.

        Raises:
            re.error: In regex mode, when ``val`` is not a valid pattern.
        """
        if mode == "numeric":
            lemmas = [
                t.lemma_.lower()
                for t in doc
                if not t.is_punct and not t.is_space
            ]
            try:
                parts = int(val)
            except (TypeError, ValueError, OverflowError):
                # Unusable part count: fall back to one chunk with everything.
                return [lemmas]
            if not lemmas:
                # Explicit empty-document case (a zero step would break range()).
                return [lemmas]
            chunk_size = math.ceil(len(lemmas) / parts) if parts > 0 else len(lemmas)
            return [
                lemmas[start:start + chunk_size]
                for start in range(0, len(lemmas), chunk_size)
            ]

        chunks_lemmas = []
        if mode == "regex":
            # Quick approximate tokenisation of the raw text: avoids running
            # the full NLP pipeline again, at the cost of returning tokens
            # instead of lemmas.
            for part in re.split(str(val), text):
                # re.split emits None for capture groups that did not
                # participate in the match; skip those and blank parts.
                if part is None or not part.strip():
                    continue
                chunks_lemmas.append(re.findall(r'\b\w+\b', part.lower()))
        return chunks_lemmas

    def get_trends(self, chunks_lemmas):
        """Build per-chunk absolute-frequency and TF-IDF tables.

        Args:
            chunks_lemmas: List of chunks, each a list of word strings.

        Returns:
            A tuple ``(df_freq, df_tfidf)``. ``df_freq`` has columns "Part",
            "Word", "Count"; ``df_tfidf`` is in long format with columns
            "Part", "Word", "TF-IDF". Parts are numbered from 1. Either
            frame may be empty but always carries its columns.
        """
        freq_columns = ["Part", "Word", "Count"]
        tfidf_columns = ["Part", "Word", "TF-IDF"]

        docs = [" ".join(chunk) for chunk in chunks_lemmas]
        if not docs:
            return pd.DataFrame(columns=freq_columns), pd.DataFrame(columns=tfidf_columns)

        # Absolute frequencies.
        freq_data = [
            {"Part": part, "Word": word, "Count": count}
            for part, chunk in enumerate(chunks_lemmas, start=1)
            for word, count in Counter(chunk).items()
        ]
        df_freq = pd.DataFrame(freq_data, columns=freq_columns)

        # TF-IDF. The vectorizer runs with default settings, so it applies
        # its own tokenisation to the joined chunk strings.
        vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = vectorizer.fit_transform(docs)
        except ValueError:
            # Raised when fitting fails on these docs (e.g. no usable terms).
            df_tfidf = pd.DataFrame(columns=tfidf_columns)
        else:
            df_tfidf = pd.DataFrame(
                tfidf_matrix.toarray(),
                columns=vectorizer.get_feature_names_out(),
            )
            df_tfidf["Part"] = range(1, len(docs) + 1)
            # Melt to long format so it can be fed to Plotly.
            df_tfidf = df_tfidf.melt(
                id_vars=["Part"], var_name="Word", value_name="TF-IDF"
            )
        return df_freq, df_tfidf