Spaces:
Sleeping
Sleeping
Update preprocessor.py
Browse files- preprocessor.py +103 -62
preprocessor.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import re
|
| 2 |
import pandas as pd
|
| 3 |
# from sentiment_train import predict_sentiment
|
| 4 |
-
from sentiment import
|
| 5 |
import spacy
|
| 6 |
from langdetect import detect, LangDetectException
|
| 7 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
@@ -72,64 +72,52 @@ def preprocess_for_clustering(df, n_clusters=5):
|
|
| 72 |
|
| 73 |
return df, reduced_features, kmeans.cluster_centers_
|
| 74 |
|
| 75 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
#
|
| 91 |
-
df =
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
messages.append(match.group(2))
|
| 104 |
-
else:
|
| 105 |
-
users.append("group_notification")
|
| 106 |
-
messages.append(message)
|
| 107 |
-
|
| 108 |
-
df["user"] = users
|
| 109 |
-
df["message"] = messages
|
| 110 |
df = df[df["user"] != "group_notification"]
|
| 111 |
df.reset_index(drop=True, inplace=True)
|
| 112 |
|
| 113 |
-
|
| 114 |
df["unfiltered_messages"] = df["message"]
|
| 115 |
# Clean messages
|
| 116 |
df["message"] = df["message"].apply(clean_message)
|
| 117 |
-
|
| 118 |
-
# Filter and lemmatize messages
|
| 119 |
-
lemmatized_messages = []
|
| 120 |
-
for message in df["message"]:
|
| 121 |
-
try:
|
| 122 |
-
lang = detect(message)
|
| 123 |
-
lemmatized_messages.append(lemmatize_text(message, lang))
|
| 124 |
-
except LangDetectException:
|
| 125 |
-
lemmatized_messages.append("")
|
| 126 |
-
|
| 127 |
-
df["lemmatized_message"] = lemmatized_messages
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
# Drop original column
|
| 131 |
-
df.drop(columns=["user_message"], inplace=True)
|
| 132 |
-
|
| 133 |
# Extract time-based features
|
| 134 |
df['year'] = df['date'].dt.year
|
| 135 |
df['month'] = df['date'].dt.month_name()
|
|
@@ -138,16 +126,67 @@ def preprocess(data):
|
|
| 138 |
df['day_of_week'] = df['date'].dt.day_name()
|
| 139 |
df['minute'] = df['date'].dt.minute
|
| 140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
# Apply sentiment analysis
|
| 142 |
-
|
| 143 |
-
|
| 144 |
|
| 145 |
# Filter out rows with null lemmatized_message
|
| 146 |
-
|
| 147 |
|
| 148 |
# **Fix: Use a custom stop word list**
|
| 149 |
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
# Apply LDA
|
| 153 |
lda = LatentDirichletAllocation(n_components=5, random_state=42)
|
|
@@ -155,15 +194,17 @@ def preprocess(data):
|
|
| 155 |
|
| 156 |
# Assign topics to messages
|
| 157 |
topic_results = lda.transform(dtm)
|
| 158 |
-
|
| 159 |
-
|
| 160 |
|
| 161 |
# Store topics for visualization
|
| 162 |
topics = []
|
| 163 |
for topic in lda.components_:
|
| 164 |
topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
|
| 165 |
-
|
| 166 |
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
import pandas as pd
|
| 3 |
# from sentiment_train import predict_sentiment
|
| 4 |
+
from sentiment import predict_sentiment_bert_batch
|
| 5 |
import spacy
|
| 6 |
from langdetect import detect, LangDetectException
|
| 7 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
| 72 |
|
| 73 |
return df, reduced_features, kmeans.cluster_centers_
|
| 74 |
|
| 75 |
+
def parse_data(data):
|
| 76 |
+
"""
|
| 77 |
+
Parses the raw chat data into a DataFrame and performs basic cleaning.
|
| 78 |
+
"""
|
| 79 |
+
# Optimization: Use pandas vectorized string operations instead of looping
|
| 80 |
+
|
| 81 |
+
# Split lines
|
| 82 |
+
lines = data.strip().split("\n")
|
| 83 |
+
df = pd.DataFrame({'line': lines})
|
| 84 |
+
|
| 85 |
+
# Extract Date, Time, Sender, Message using regex
|
| 86 |
pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
|
| 87 |
+
|
| 88 |
+
extracted = df['line'].str.extract(pattern)
|
| 89 |
+
|
| 90 |
+
# Drop lines that didn't match (if any)
|
| 91 |
+
extracted = extracted.dropna(subset=['Date', 'Time', 'Message'])
|
| 92 |
+
|
| 93 |
+
# Combine Date and Time
|
| 94 |
+
extracted['Time'] = extracted['Time'].str.replace(' ', ' ', regex=False)
|
| 95 |
+
extracted['message_date'] = extracted['Date'] + ", " + extracted['Time']
|
| 96 |
+
|
| 97 |
+
# Handle Sender
|
| 98 |
+
extracted['Sender'] = extracted['Sender'].fillna('group_notification')
|
| 99 |
+
|
| 100 |
+
# Rename columns
|
| 101 |
+
df = extracted.rename(columns={'Sender': 'user', 'Message': 'message'})
|
| 102 |
+
|
| 103 |
+
# Filter out system messages
|
| 104 |
+
df = df[df['user'].str.lower() != 'system']
|
| 105 |
+
|
| 106 |
+
# Convert date
|
| 107 |
+
df['date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
|
| 108 |
+
|
| 109 |
+
# Filter out invalid dates
|
| 110 |
+
df = df.dropna(subset=['date'])
|
| 111 |
+
|
| 112 |
+
# Filter out group notifications
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
df = df[df["user"] != "group_notification"]
|
| 114 |
df.reset_index(drop=True, inplace=True)
|
| 115 |
|
| 116 |
+
# unfiltered messages
|
| 117 |
df["unfiltered_messages"] = df["message"]
|
| 118 |
# Clean messages
|
| 119 |
df["message"] = df["message"].apply(clean_message)
|
| 120 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
# Extract time-based features
|
| 122 |
df['year'] = df['date'].dt.year
|
| 123 |
df['month'] = df['date'].dt.month_name()
|
|
|
|
| 126 |
df['day_of_week'] = df['date'].dt.day_name()
|
| 127 |
df['minute'] = df['date'].dt.minute
|
| 128 |
|
| 129 |
+
period = []
|
| 130 |
+
for hour in df['hour']:
|
| 131 |
+
if hour == 23:
|
| 132 |
+
period.append(str(hour) + "-" + str('00'))
|
| 133 |
+
elif hour == 0:
|
| 134 |
+
period.append(str('00') + "-" + str(hour + 1))
|
| 135 |
+
else:
|
| 136 |
+
period.append(str(hour) + "-" + str(hour + 1))
|
| 137 |
+
|
| 138 |
+
df['period'] = period
|
| 139 |
+
|
| 140 |
+
return df
|
| 141 |
+
|
| 142 |
+
def analyze_sentiment_and_topics(df):
|
| 143 |
+
"""
|
| 144 |
+
Performs heavy NLP tasks: Lemmatization, Sentiment Analysis, and Topic Modeling.
|
| 145 |
+
Includes sampling for large datasets.
|
| 146 |
+
"""
|
| 147 |
+
# Sampling Logic: Cap at 5000 messages for deep analysis
|
| 148 |
+
original_df_len = len(df)
|
| 149 |
+
if len(df) > 5000:
|
| 150 |
+
print(f"Sampling 5000 messages from {len(df)}...")
|
| 151 |
+
# We keep the original index to potentially map back, but for now we just work on the sample
|
| 152 |
+
df_sample = df.sample(5000, random_state=42).copy()
|
| 153 |
+
else:
|
| 154 |
+
df_sample = df.copy()
|
| 155 |
+
|
| 156 |
+
# Filter and lemmatize messages
|
| 157 |
+
lemmatized_messages = []
|
| 158 |
+
# Optimization: Detect dominant language on a sample
|
| 159 |
+
sample_size = min(len(df_sample), 500)
|
| 160 |
+
sample_text = " ".join(df_sample["message"].sample(sample_size, random_state=42).tolist())
|
| 161 |
+
try:
|
| 162 |
+
dominant_lang = detect(sample_text)
|
| 163 |
+
except LangDetectException:
|
| 164 |
+
dominant_lang = 'en'
|
| 165 |
+
|
| 166 |
+
nlp = nlp_fr if dominant_lang == 'fr' else nlp_en
|
| 167 |
+
|
| 168 |
+
# Use nlp.pipe for batch processing
|
| 169 |
+
lemmatized_messages = []
|
| 170 |
+
for doc in nlp.pipe(df_sample["message"].tolist(), batch_size=1000, disable=["ner", "parser"]):
|
| 171 |
+
lemmatized_messages.append(" ".join([token.lemma_ for token in doc if not token.is_punct]))
|
| 172 |
+
|
| 173 |
+
df_sample["lemmatized_message"] = lemmatized_messages
|
| 174 |
+
|
| 175 |
# Apply sentiment analysis
|
| 176 |
+
# Use batch processing for speed
|
| 177 |
+
df_sample['sentiment'] = predict_sentiment_bert_batch(df_sample["message"].tolist(), batch_size=128)
|
| 178 |
|
| 179 |
# Filter out rows with null lemmatized_message
|
| 180 |
+
df_sample = df_sample.dropna(subset=['lemmatized_message'])
|
| 181 |
|
| 182 |
# **Fix: Use a custom stop word list**
|
| 183 |
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
|
| 184 |
+
try:
|
| 185 |
+
dtm = vectorizer.fit_transform(df_sample['lemmatized_message'])
|
| 186 |
+
except ValueError:
|
| 187 |
+
# Handle case where vocabulary is empty (e.g. all stop words)
|
| 188 |
+
print("Warning: Empty vocabulary after filtering. Returning empty topics.")
|
| 189 |
+
return df_sample, []
|
| 190 |
|
| 191 |
# Apply LDA
|
| 192 |
lda = LatentDirichletAllocation(n_components=5, random_state=42)
|
|
|
|
| 194 |
|
| 195 |
# Assign topics to messages
|
| 196 |
topic_results = lda.transform(dtm)
|
| 197 |
+
df_sample = df_sample.iloc[:topic_results.shape[0]].copy()
|
| 198 |
+
df_sample['topic'] = topic_results.argmax(axis=1)
|
| 199 |
|
| 200 |
# Store topics for visualization
|
| 201 |
topics = []
|
| 202 |
for topic in lda.components_:
|
| 203 |
topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
|
|
|
|
| 204 |
|
| 205 |
+
# If we sampled, we return the sampled dataframe with sentiment/topics.
|
| 206 |
+
# The main app will need to handle that 'df' (full) and 'df_analyzed' (sample) might be different.
|
| 207 |
+
# Or we can try to merge back? Merging back 5000 sentiments to 40000 messages leaves 35000 nulls.
|
| 208 |
+
# For visualization purposes (pie charts, etc), using the sample is usually fine as it's representative.
|
| 209 |
+
|
| 210 |
+
return df_sample, topics
|