# SocialMediaFoci / preprocessor.py
# Author: Bismark
# Commit: "Update Space" (5ab54b7)
import re
import pandas as pd
# from sentiment_train import predict_sentiment
from sentiment import predict_sentiment_bert_batch
import spacy
from langdetect import detect, LangDetectException
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.lang.fr.stop_words import STOP_WORDS as FRENCH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
# Load language models
nlp_fr = spacy.load("fr_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")
# Merge English and French stop words
custom_stop_words = list(ENGLISH_STOP_WORDS.union(FRENCH_STOP_WORDS))
def lemmatize_text(text, lang):
    """Lemmatize *text* with the spaCy model matching *lang*.

    Args:
        text (str): Raw text to lemmatize.
        lang (str): Language code; 'fr' selects the French model,
            anything else falls back to the English model.

    Returns:
        str: Space-joined lemmas with punctuation tokens dropped.
    """
    doc = nlp_fr(text) if lang == 'fr' else nlp_en(text)
    return " ".join(tok.lemma_ for tok in doc if not tok.is_punct)
def clean_message(text):
    """Normalize a raw chat message for downstream NLP.

    Lowercases the text, strips WhatsApp placeholders ("<media omitted>",
    "this message was deleted", "null"), URLs, and every character outside
    basic/accented Latin letters, digits, and whitespace.

    Args:
        text: Raw message. Non-string values (e.g. NaN from pandas) yield "".

    Returns:
        str: Cleaned, lowercased message (possibly empty).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Normalize case before the lowercase-only patterns below
    text = re.sub(r"<media omitted>", "", text)  # WhatsApp media placeholder
    text = re.sub(r"this message was deleted", "", text)
    # Fix: match "null" only as a whole word — the previous bare pattern
    # stripped the substring inside real words (e.g. "annulled" -> "aed").
    text = re.sub(r"\bnull\b", "", text)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove links
    text = re.sub(r"[^a-zA-ZÀ-ÿ0-9\s]", "", text)  # Remove special characters
    return text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import numpy as np
def preprocess_for_clustering(df, n_clusters=5):
    """
    Cluster lemmatized messages with TF-IDF + K-Means and project them to 2-D.

    Args:
        df (pd.DataFrame): DataFrame containing a 'lemmatized_message' column.
            NOTE: the 'cluster' column is added to this frame in place.
        n_clusters (int): Number of K-Means clusters to create.

    Returns:
        tuple: Three values (the previous docstring documented only two):
            df (pd.DataFrame): Input frame with an added 'cluster' column.
            reduced_features (np.ndarray): t-SNE 2-D embedding, one row per message.
            cluster_centers (np.ndarray): K-Means centroids in TF-IDF space.
    """
    # Step 1: Vectorize text using TF-IDF (vocabulary capped for speed/memory)
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])
    # Step 2: Apply K-Means clustering (fixed seed for reproducibility)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(tfidf_matrix)
    # Step 3: Add cluster labels to DataFrame (in-place mutation of caller's df)
    df['cluster'] = clusters
    # Step 4: Reduce dimensionality for visualization.
    # t-SNE requires a dense array; acceptable at <=5000 features.
    tsne = TSNE(n_components=2, random_state=42)
    reduced_features = tsne.fit_transform(tfidf_matrix.toarray())
    return df, reduced_features, kmeans.cluster_centers_
def _hour_period(hour):
    """Return the "start-end" hour bucket label (e.g. "13-14", "23-00")."""
    if hour == 23:
        return "23-00"
    if hour == 0:
        return "00-1"
    return f"{hour}-{hour + 1}"
def parse_data(data):
    """
    Parse raw WhatsApp chat export text into a cleaned DataFrame.

    Args:
        data (str): Full chat export, one message (or continuation) per line.

    Returns:
        pd.DataFrame: One row per user message with columns: message_date,
            user, message (cleaned), unfiltered_messages (raw text), date
            (datetime), year, month, day, hour, day_of_week, minute, and
            period (hourly bucket label for activity heatmaps).
    """
    # Optimization: Use pandas vectorized string operations instead of looping
    lines = data.strip().split("\n")
    df = pd.DataFrame({'line': lines})
    # Extract Date, Time, optional "Sender:", and Message via one regex
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
    extracted = df['line'].str.extract(pattern)
    # Drop lines that didn't match the header (e.g. wrapped message lines)
    extracted = extracted.dropna(subset=['Date', 'Time', 'Message'])
    # Fix: WhatsApp exports place a narrow no-break space (U+202F) before
    # AM/PM. The previous call replaced a character visually identical to a
    # plain space; spell it out so '%I:%M %p' parsing succeeds.
    extracted['Time'] = extracted['Time'].str.replace('\u202f', ' ', regex=False)
    extracted['message_date'] = extracted['Date'] + ", " + extracted['Time']
    # Lines without a "Sender:" prefix are system/group notifications
    extracted['Sender'] = extracted['Sender'].fillna('group_notification')
    df = extracted.rename(columns={'Sender': 'user', 'Message': 'message'})
    # Filter out system messages
    df = df[df['user'].str.lower() != 'system']
    # Coerce unparseable timestamps to NaT, then drop them
    df['date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    df = df.dropna(subset=['date'])
    # Filter out group notifications
    df = df[df["user"] != "group_notification"]
    df.reset_index(drop=True, inplace=True)
    # Keep the raw text before cleaning, for display purposes
    df["unfiltered_messages"] = df["message"]
    df["message"] = df["message"].apply(clean_message)
    # Extract time-based features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.day_name()
    df['minute'] = df['date'].dt.minute
    df['period'] = [_hour_period(h) for h in df['hour']]
    return df
def analyze_sentiment_and_topics(df):
    """
    Performs heavy NLP tasks: Lemmatization, Sentiment Analysis, and Topic Modeling.
    Includes sampling for large datasets.

    Args:
        df (pd.DataFrame): Parsed chat frame; must contain a 'message' column.

    Returns:
        tuple:
            df_sample (pd.DataFrame): Sampled (<=5000 rows) copy of *df* with
                added 'lemmatized_message', 'sentiment', and (unless the
                vocabulary was empty) 'topic' columns.
            topics (list[list[str]]): Top-10 terms per LDA topic; empty list
                when the vocabulary was empty after stop-word filtering.
    """
    # Sampling Logic: Cap at 5000 messages for deep analysis
    original_df_len = len(df)
    if len(df) > 5000:
        print(f"Sampling 5000 messages from {len(df)}...")
        # We keep the original index to potentially map back, but for now we just work on the sample
        df_sample = df.sample(5000, random_state=42).copy()
    else:
        df_sample = df.copy()
    # Filter and lemmatize messages
    lemmatized_messages = []
    # Optimization: Detect dominant language once on a 500-message sample
    # instead of per message; the whole chat then uses one spaCy model.
    sample_size = min(len(df_sample), 500)
    sample_text = " ".join(df_sample["message"].sample(sample_size, random_state=42).tolist())
    try:
        dominant_lang = detect(sample_text)
    except LangDetectException:
        # Detection fails on empty/ambiguous text; default to English.
        dominant_lang = 'en'
    nlp = nlp_fr if dominant_lang == 'fr' else nlp_en
    # Use nlp.pipe for batch processing (NER and parser disabled: only lemmas needed)
    lemmatized_messages = []
    for doc in nlp.pipe(df_sample["message"].tolist(), batch_size=1000, disable=["ner", "parser"]):
        lemmatized_messages.append(" ".join([token.lemma_ for token in doc if not token.is_punct]))
    df_sample["lemmatized_message"] = lemmatized_messages
    # Apply sentiment analysis
    # Use batch processing for speed
    df_sample['sentiment'] = predict_sentiment_bert_batch(df_sample["message"].tolist(), batch_size=128)
    # Filter out rows with null lemmatized_message
    # NOTE(review): the join above always produces strings (possibly empty),
    # so this dropna looks like a no-op safety net — confirm before removing.
    df_sample = df_sample.dropna(subset=['lemmatized_message'])
    # **Fix: Use a custom stop word list** (merged English + French, module level)
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    try:
        dtm = vectorizer.fit_transform(df_sample['lemmatized_message'])
    except ValueError:
        # Handle case where vocabulary is empty (e.g. all stop words)
        print("Warning: Empty vocabulary after filtering. Returning empty topics.")
        return df_sample, []
    # Apply LDA
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    lda.fit(dtm)
    # Assign topics to messages
    topic_results = lda.transform(dtm)
    # Defensive truncation: dtm has one row per df_sample row, so this slice
    # should be a no-op; the .copy() avoids a SettingWithCopy warning.
    df_sample = df_sample.iloc[:topic_results.shape[0]].copy()
    df_sample['topic'] = topic_results.argmax(axis=1)
    # Store topics for visualization
    topics = []
    for topic in lda.components_:
        # argsort is ascending, so the last 10 indices are the highest-weight terms
        topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    # If we sampled, we return the sampled dataframe with sentiment/topics.
    # The main app will need to handle that 'df' (full) and 'df_analyzed' (sample) might be different.
    # Or we can try to merge back? Merging back 5000 sentiments to 40000 messages leaves 35000 nulls.
    # For visualization purposes (pie charts, etc), using the sample is usually fine as it's representative.
    return df_sample, topics