| import math
|
| from collections import Counter
|
| import pandas as pd
|
| from sklearn.feature_extraction.text import TfidfVectorizer
|
| import re
|
|
|
class FrequenciesModule:
    """Frequency statistics over lemma sequences.

    Provides n-gram PMI scoring, splitting a document into chunks, and
    per-chunk word counts / TF-IDF weights.
    """

    def get_pmi(self, lemmas, n=2, min_freq=2, top_n=15):
        """Score the n-grams of *lemmas* by pointwise mutual information.

        PMI is ``log2(p(ngram) / prod(p(word)))`` where ``p(ngram)`` is the
        n-gram's share of all n-grams and ``p(word)`` is the word's share of
        all lemmas.

        Args:
            lemmas: Sequence of lemma strings (must support ``len`` and
                slicing).
            n: Size of the n-grams.
            min_freq: Minimum number of occurrences for an n-gram to be
                scored.
            top_n: Maximum number of rows returned.

        Returns:
            A DataFrame with columns ``N-gram``, ``PMI`` and ``Freq``, sorted
            by descending PMI. The frame is empty (but keeps those columns)
            when there are fewer than *n* lemmas or no n-gram reaches
            *min_freq*.
        """
        columns = ["N-gram", "PMI", "Freq"]
        if n < 1 or len(lemmas) < n:
            return pd.DataFrame(columns=columns)

        # Sliding window: zip the sequence against its own shifted copies.
        ngrams = list(zip(*(lemmas[i:] for i in range(n))))
        ngram_counts = Counter(ngrams)
        word_counts = Counter(lemmas)
        tot_words = len(lemmas)
        tot_ngrams = len(ngrams)

        rows = []
        for ngram, freq in ngram_counts.items():
            if freq < min_freq:
                continue
            p_xy = freq / tot_ngrams
            p_independent = math.prod(word_counts[w] / tot_words for w in ngram)
            # Guard against a zero product so log2 never divides by zero.
            pmi = math.log2(p_xy / p_independent) if p_independent > 0 else 0
            rows.append({"N-gram": " ".join(ngram), "PMI": pmi, "Freq": freq})

        df = pd.DataFrame(rows, columns=columns)
        if df.empty:
            return df
        # A stable sort keeps n-grams with equal PMI in a deterministic order.
        return df.sort_values("PMI", ascending=False, kind="mergesort").head(top_n)

    def chunk_doc(self, text, doc, mode="numeric", val=5):
        """Split a document into chunks of lowercase word tokens.

        Args:
            text: Raw text; only used when ``mode == "regex"``.
            doc: Iterable of token objects exposing ``lemma_``, ``is_punct``
                and ``is_space``; only used when ``mode == "numeric"``.
                (Presumably a spaCy ``Doc`` -- confirm against the caller.)
            mode: ``"numeric"`` cuts the lemma list into consecutive slices of
                ``ceil(len(lemmas) / val)`` items, so short inputs can yield
                fewer than *val* chunks. ``"regex"`` splits *text* on the
                pattern ``str(val)``.
            val: Requested number of parts (numeric mode) or the separator
                pattern (regex mode).

        Returns:
            A list of chunks, each a list of strings. Numeric mode returns a
            single chunk holding every lemma when *val* is not a usable
            positive integer or the document has no lemmas. Regex chunks
            contain lowercased ``\\w+`` tokens of the raw text, not lemmas.
            An unknown *mode* returns an empty list.

        Raises:
            re.error: In regex mode, if ``str(val)`` is not a valid pattern.
        """
        if mode == "numeric":
            lemmas = [
                t.lemma_.lower() for t in doc if not t.is_punct and not t.is_space
            ]
            try:
                parts = int(val)
            except (TypeError, ValueError, OverflowError):
                return [lemmas]
            # Nothing to slice, or a non-positive part count: one chunk.
            if not lemmas or parts <= 0:
                return [lemmas]
            chunk_size = math.ceil(len(lemmas) / parts)
            return [
                lemmas[i:i + chunk_size] for i in range(0, len(lemmas), chunk_size)
            ]

        if mode == "regex":
            chunks_lemmas = []
            for part in re.split(str(val), text):
                # re.split yields None for capture groups that did not take
                # part in a match; skip those along with blank pieces.
                if part and part.strip():
                    chunks_lemmas.append(re.findall(r"\b\w+\b", part.lower()))
            return chunks_lemmas

        return []

    def get_trends(self, chunks_lemmas):
        """Compute per-chunk word counts and TF-IDF weights.

        Args:
            chunks_lemmas: Iterable of chunks, each an iterable of strings.

        Returns:
            A tuple ``(df_freq, df_tfidf)``. ``df_freq`` has columns ``Part``
            (1-based chunk number), ``Word`` and ``Count``. ``df_tfidf`` has
            columns ``Part``, ``Word`` and ``TF-IDF`` in long format. Either
            frame may be empty but always carries its columns.
        """
        freq_columns = ["Part", "Word", "Count"]
        tfidf_columns = ["Part", "Word", "TF-IDF"]

        # Materialize once: the chunks are walked twice below.
        chunks_lemmas = list(chunks_lemmas)
        docs = [" ".join(chunk) for chunk in chunks_lemmas]
        if not docs:
            return (
                pd.DataFrame(columns=freq_columns),
                pd.DataFrame(columns=tfidf_columns),
            )

        freq_rows = [
            {"Part": part, "Word": word, "Count": count}
            for part, chunk in enumerate(chunks_lemmas, start=1)
            for word, count in Counter(chunk).items()
        ]
        df_freq = pd.DataFrame(freq_rows, columns=freq_columns)

        # NOTE(review): TfidfVectorizer re-tokenizes the joined text with its
        # own default settings, so its vocabulary may not match the Word
        # values in df_freq one-to-one -- confirm this is intended.
        vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = vectorizer.fit_transform(docs)
        except ValueError:
            # Raised when no usable vocabulary can be built from the chunks.
            return df_freq, pd.DataFrame(columns=tfidf_columns)

        df_tfidf = pd.DataFrame(
            tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()
        )
        df_tfidf["Part"] = range(1, len(docs) + 1)
        # Wide (one column per word) -> long (one row per part/word pair).
        df_tfidf = df_tfidf.melt(
            id_vars=["Part"], var_name="Word", value_name="TF-IDF"
        )
        return df_freq, df_tfidf