# SocialMediaFoci / preprocessor.py
# Author: Bismark
# Commit: "Update Space" (5ab54b7)
import re
import pandas as pd
# from sentiment_train import predict_sentiment
from sentiment import predict_sentiment_bert_batch
import spacy
from langdetect import detect, LangDetectException
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
# Load language models
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")
# Merge English and French stop words
custom_stop_words = list(ENGLISH_STOP_WORDS.union(FRENCH_STOP_WORDS))
def lemmatize_text(text, lang):
    """Lemmatize *text* with the spaCy model matching *lang*.

    Args:
        text (str): Raw text to lemmatize.
        lang (str): Language code; 'fr' selects the French model,
            anything else falls back to the English model.

    Returns:
        str: Space-joined lemmas with punctuation tokens dropped.
    """
    doc = nlp_fr(text) if lang == 'fr' else nlp_en(text)
    return " ".join(tok.lemma_ for tok in doc if not tok.is_punct)
def clean_message(text):
    """Normalize a raw chat message for downstream NLP.

    Lowercases the text, strips WhatsApp placeholders ("<media omitted>",
    "this message was deleted", "null"), URLs, and every character outside
    basic/accented Latin letters, digits, and whitespace.

    Args:
        text: Raw message. Non-string values (e.g. NaN from pandas) yield "".

    Returns:
        str: Cleaned, lowercased message (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Normalize case before the lowercase-only patterns below
    text = re.sub(r"<media omitted>", "", text)  # WhatsApp media placeholder
    text = re.sub(r"this message was deleted", "", text)
    # Fix: match "null" only as a whole word — the previous bare pattern
    # stripped the substring inside real words (e.g. "annulled" -> "aed").
    text = re.sub(r"\bnull\b", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove links
    text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text)  # Remove special characters
    return text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
def preprocess_for_clustering(df, n_clusters=5):
    """
    Cluster lemmatized messages with TF-IDF + K-Means and project them to 2-D.

    Args:
        df (pd.DataFrame): DataFrame containing a 'lemmatized_message' column.
            NOTE: the 'cluster' column is added to this frame in place.
        n_clusters (int): Number of K-Means clusters to create.

    Returns:
        tuple: Three values (the previous docstring documented only two):
            df (pd.DataFrame): Input frame with an added 'cluster' column.
            reduced_features (np.ndarray): t-SNE 2-D embedding, one row per message.
            cluster_centers (np.ndarray): K-Means centroids in TF-IDF space.
    """
    # Step 1: Vectorize text using TF-IDF (vocabulary capped for speed/memory)
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
    # Step 2: Apply K-Means clustering (fixed seed for reproducibility)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(tfidf_matrix)
    # Step 3: Add cluster labels to DataFrame (in-place mutation of caller's df)
    df['cluster'] = clusters
    # Step 4: Reduce dimensionality for visualization.
    # t-SNE requires a dense array; acceptable at <=5000 features.
    tsne = TSNE(n_components=2, random_state=42)
    reduced_features = tsne.fit_transform(tfidf_matrix.toarray())
    return df, reduced_features, kmeans.cluster_centers_
def _hour_period(hour):
    """Return the "start-end" hour bucket label (e.g. "13-14", "23-00")."""
    if hour == 23:
        return "23-00"
    if hour == 0:
        return "00-1"
    return f"{hour}-{hour + 1}"
def parse_data(data):
    """
    Parse raw WhatsApp chat export text into a cleaned DataFrame.

    Args:
        data (str): Full chat export, one message (or continuation) per line.

    Returns:
        pd.DataFrame: One row per user message with columns: message_date,
            user, message (cleaned), unfiltered_messages (raw text), date
            (datetime), year, month, day, hour, day_of_week, minute, and
            period (hourly bucket label for activity heatmaps).
    """
    # Optimization: Use pandas vectorized string operations instead of looping
    lines = data.strip().split("\n")
    df = pd.DataFrame({'line': lines})
    # Extract Date, Time, optional "Sender:", and Message via one regex
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
    extracted = df['line'].str.extract(pattern)
    # Drop lines that didn't match the header (e.g. wrapped message lines)
    extracted = extracted.dropna(subset=['Date', 'Time', 'Message'])
    # Fix: WhatsApp exports place a narrow no-break space (U+202F) before
    # AM/PM. The previous call replaced a character visually identical to a
    # plain space; spell it out so '%I:%M %p' parsing succeeds.
    extracted['Time'] = extracted['Time'].str.replace('\u202f', ' ', regex=False)
    extracted['message_date'] = extracted['Date'] + ", " + extracted['Time']
    # Lines without a "Sender:" prefix are system/group notifications
    extracted['Sender'] = extracted['Sender'].fillna('group_notification')
    df = extracted.rename(columns={'Sender': 'user', 'Message': 'message'})
    # Filter out system messages
    df = df[df['user'].str.lower() != 'system']
    # Coerce unparseable timestamps to NaT, then drop them
    df['date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    df = df.dropna(subset=['date'])
    # Filter out group notifications
    df = df[df["user"] != "group_notification"]
    df.reset_index(drop=True, inplace=True)
    # Keep the raw text before cleaning, for display purposes
    df["unfiltered_messages"] = df["message"]
    df["message"] = df["message"].apply(clean_message)
    # Extract time-based features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.day_name()
    df['minute'] = df['date'].dt.minute
    df['period'] = [_hour_period(h) for h in df['hour']]
    return df
def analyze_sentiment_and_topics(df):
    """
    Performs heavy NLP tasks: Lemmatization, Sentiment Analysis, and Topic Modeling.
    Includes sampling for large datasets.

    Args:
        df (pd.DataFrame): Parsed chat frame; must contain a 'message' column.

    Returns:
        tuple:
            df_sample (pd.DataFrame): Sampled (<=5000 rows) copy of *df* with
                added 'lemmatized_message', 'sentiment', and (unless the
                vocabulary was empty) 'topic' columns.
            topics (list[list[str]]): Top-10 terms per LDA topic; empty list
                when the vocabulary was empty after stop-word filtering.
    """
    # Sampling Logic: Cap at 5000 messages for deep analysis
    original_df_len = len(df)
    if len(df) > 5000:
        print(f"Sampling 5000 messages from {len(df)}...")
        # We keep the original index to potentially map back, but for now we just work on the sample
        df_sample = df.sample(5000, random_state=42).copy()
    else:
        df_sample = df.copy()
    # Filter and lemmatize messages
    lemmatized_messages = []
    # Optimization: Detect dominant language once on a 500-message sample
    # instead of per message; the whole chat then uses one spaCy model.
    sample_size = min(len(df_sample), 500)
    sample_text = " ".join(df_sample["message"].sample(sample_size, random_state=42).tolist())
    try:
        dominant_lang = detect(sample_text)
    except LangDetectException:
        # Detection fails on empty/ambiguous text; default to English.
        dominant_lang = 'en'
    nlp = nlp_fr if dominant_lang == 'fr' else nlp_en
    # Use nlp.pipe for batch processing (NER and parser disabled: only lemmas needed)
    lemmatized_messages = []
    for doc in nlp.pipe(df_sample["message"].tolist(), batch_size=1000, disable=["ner", "parser"]):
        lemmatized_messages.append(" ".join([token.lemma_ for token in doc if not token.is_punct]))
    df_sample["lemmatized_message"] = lemmatized_messages
    # Apply sentiment analysis
    # Use batch processing for speed
    df_sample['sentiment'] = predict_sentiment_bert_batch(df_sample["message"].tolist(), batch_size=128)
    # Filter out rows with null lemmatized_message
    # NOTE(review): the join above always produces strings (possibly empty),
    # so this dropna looks like a no-op safety net — confirm before removing.
    df_sample = df_sample.dropna(subset=['lemmatized_message'])
    # **Fix: Use a custom stop word list** (merged English + French, module level)
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    try:
        dtm = vectorizer.fit_transform(df_sample['lemmatized_message'])
    except ValueError:
        # Handle case where vocabulary is empty (e.g. all stop words)
        print("Warning: Empty vocabulary after filtering. Returning empty topics.")
        return df_sample, []
    # Apply LDA
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)
    # Assign topics to messages
    topic_results = lda.transform(dtm)
    # Defensive truncation: dtm has one row per df_sample row, so this slice
    # should be a no-op; the .copy() avoids a SettingWithCopy warning.
    df_sample = df_sample.iloc[:topic_results.shape[0]].copy()
    df_sample['topic'] = topic_results.argmax(axis=1)
    # Store topics for visualization
    topics = []
    for topic in lda.components_:
        # argsort is ascending, so the last 10 indices are the highest-weight terms
        topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    # If we sampled, we return the sampled dataframe with sentiment/topics.
    # The main app will need to handle that 'df' (full) and 'df_analyzed' (sample) might be different.
    # Or we can try to merge back? Merging back 5000 sentiments to 40000 messages leaves 35000 nulls.
    # For visualization purposes (pie charts, etc), using the sample is usually fine as it's representative.
    return df_sample, topics