afanyu237 committed on
Commit
d67c92b
·
verified ·
1 Parent(s): 6cc22fb

Update preprocessor.py

Browse files
Files changed (1) hide show
  1. preprocessor.py +103 -62
preprocessor.py CHANGED
@@ -1,7 +1,7 @@
1
  import re
2
  import pandas as pd
3
  # from sentiment_train import predict_sentiment
4
- from sentiment import predict_sentiment_bert
5
  import spacy
6
  from langdetect import detect, LangDetectException
7
  from sklearn.feature_extraction.text import CountVectorizer
@@ -72,64 +72,52 @@ def preprocess_for_clustering(df, n_clusters=5):
72
 
73
  return df, reduced_features, kmeans.cluster_centers_
74
 
75
- def preprocess(data):
 
 
 
 
 
 
 
 
 
 
76
  pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
77
-
78
- filtered_messages = []
79
- valid_dates = []
80
-
81
- for line in data.strip().split("\n"):
82
- match = re.match(pattern, line)
83
- if match:
84
- entry = match.groupdict()
85
- sender = entry.get("Sender")
86
- if sender and sender.strip().lower() != "system": # Remove system messages
87
- filtered_messages.append(f"{sender.strip()}: {entry['Message']}")
88
- valid_dates.append(f"{entry['Date']}, {entry['Time'].replace(' ', ' ')}")
89
-
90
- # Create DataFrame
91
- df = pd.DataFrame({'user_message': filtered_messages, 'message_date': valid_dates})
92
- df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
93
- df.rename(columns={'message_date': 'date'}, inplace=True)
94
-
95
- # Separate Users and Messages
96
- users, messages = [], []
97
-
98
- msg_pattern = r"^(.*?):\s(.*)$"
99
- for message in df["user_message"]:
100
- match = re.match(msg_pattern, message)
101
- if match:
102
- users.append(match.group(1))
103
- messages.append(match.group(2))
104
- else:
105
- users.append("group_notification")
106
- messages.append(message)
107
-
108
- df["user"] = users
109
- df["message"] = messages
110
  df = df[df["user"] != "group_notification"]
111
  df.reset_index(drop=True, inplace=True)
112
 
113
- # unfiltered messages
114
  df["unfiltered_messages"] = df["message"]
115
  # Clean messages
116
  df["message"] = df["message"].apply(clean_message)
117
-
118
- # Filter and lemmatize messages
119
- lemmatized_messages = []
120
- for message in df["message"]:
121
- try:
122
- lang = detect(message)
123
- lemmatized_messages.append(lemmatize_text(message, lang))
124
- except LangDetectException:
125
- lemmatized_messages.append("")
126
-
127
- df["lemmatized_message"] = lemmatized_messages
128
-
129
-
130
- # Drop original column
131
- df.drop(columns=["user_message"], inplace=True)
132
-
133
  # Extract time-based features
134
  df['year'] = df['date'].dt.year
135
  df['month'] = df['date'].dt.month_name()
@@ -138,16 +126,67 @@ def preprocess(data):
138
  df['day_of_week'] = df['date'].dt.day_name()
139
  df['minute'] = df['date'].dt.minute
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  # Apply sentiment analysis
142
- half_data = df.head(len(df) // 2) # Select first half of the dataset
143
- df['sentiment'] = df["message"].map(predict_sentiment_bert)
144
 
145
  # Filter out rows with null lemmatized_message
146
- df = df.dropna(subset=['lemmatized_message'])
147
 
148
  # **Fix: Use a custom stop word list**
149
  vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
150
- dtm = vectorizer.fit_transform(df['lemmatized_message'])
 
 
 
 
 
151
 
152
  # Apply LDA
153
  lda = LatentDirichletAllocation(n_components=5, random_state=42)
@@ -155,15 +194,17 @@ def preprocess(data):
155
 
156
  # Assign topics to messages
157
  topic_results = lda.transform(dtm)
158
- df = df.iloc[:topic_results.shape[0]].copy()
159
- df['topic'] = topic_results.argmax(axis=1)
160
 
161
  # Store topics for visualization
162
  topics = []
163
  for topic in lda.components_:
164
  topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
165
-
166
 
167
- print(topics)
168
- print(type(topics))
169
- return df,topic
 
 
 
 
1
  import re
2
  import pandas as pd
3
  # from sentiment_train import predict_sentiment
4
+ from sentiment import predict_sentiment_bert_batch
5
  import spacy
6
  from langdetect import detect, LangDetectException
7
  from sklearn.feature_extraction.text import CountVectorizer
 
72
 
73
  return df, reduced_features, kmeans.cluster_centers_
74
 
75
def parse_data(data):
    """
    Parse raw WhatsApp-style chat export text into a cleaned DataFrame.

    Parameters
    ----------
    data : str
        Raw chat log, one message per line, e.g.
        "1/2/23, 9:05 PM - Alice: hello".

    Returns
    -------
    pd.DataFrame
        One row per user message with columns: user, message,
        unfiltered_messages, date, year, month, day, hour, minute,
        day_of_week, period. System messages, group notifications and
        rows whose date fails to parse are dropped.
    """
    # Optimization: vectorized parsing — one regex extract over all lines
    # instead of a Python-level loop with re.match.
    lines = data.strip().split("\n")
    frame = pd.DataFrame({'line': lines})

    # Extract Date, Time, Sender, Message in a single pass.
    pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
    extracted = frame['line'].str.extract(pattern)

    # Drop lines that did not match (e.g. continuation lines of a multi-line message).
    extracted = extracted.dropna(subset=['Date', 'Time', 'Message'])

    # Fix: WhatsApp exports put a narrow no-break space (U+202F) before AM/PM;
    # normalize it (and a plain NBSP, just in case) to a regular space so the
    # strptime format below can parse it. The previous replace(' ', ' ') was a no-op.
    extracted['Time'] = (extracted['Time']
                         .str.replace('\u202f', ' ', regex=False)
                         .str.replace('\u00a0', ' ', regex=False))
    extracted['message_date'] = extracted['Date'] + ", " + extracted['Time']

    # Lines without an explicit "Sender:" prefix are group notifications.
    extracted['Sender'] = extracted['Sender'].fillna('group_notification')

    df = extracted.rename(columns={'Sender': 'user', 'Message': 'message'})

    # Remove system messages. .copy() so the column assignments below operate
    # on an owned frame (avoids SettingWithCopyWarning).
    df = df[df['user'].str.lower() != 'system'].copy()

    # Convert the combined date string; unparseable rows become NaT and are dropped.
    df['date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
    df = df.dropna(subset=['date'])

    # Filter out group notifications.
    df = df[df["user"] != "group_notification"]
    df.reset_index(drop=True, inplace=True)

    # Keep the raw message text before cleaning.
    df["unfiltered_messages"] = df["message"]
    # Clean messages.
    df["message"] = df["message"].apply(clean_message)

    # Extract time-based features.
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month_name()
    # TODO confirm: the next two lines are reconstructed — 'hour' is definitely
    # required by the period labels below; verify 'day' against the original file.
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.day_name()
    df['minute'] = df['date'].dt.minute

    # Hourly activity buckets, e.g. "9-10"; midnight edges are "23-00" / "00-1".
    period = []
    for hour in df['hour']:
        if hour == 23:
            period.append(str(hour) + "-" + str('00'))
        elif hour == 0:
            period.append(str('00') + "-" + str(hour + 1))
        else:
            period.append(str(hour) + "-" + str(hour + 1))

    df['period'] = period

    return df
141
+
142
def analyze_sentiment_and_topics(df):
    """
    Run the heavy NLP pipeline on a parsed chat DataFrame: lemmatization,
    batched BERT sentiment analysis, and LDA topic modeling.

    Parameters
    ----------
    df : pd.DataFrame
        Output of parse_data(); must contain a 'message' column of cleaned text.

    Returns
    -------
    (pd.DataFrame, list)
        The analyzed frame — a <=5000-row sample of the input when the chat is
        large — with added 'lemmatized_message', 'sentiment' and 'topic'
        columns, and a list of 5 topics, each a list of its 10 strongest
        terms. The topic list is empty if the vocabulary is empty after
        stop-word filtering.
    """
    # Sampling: cap the deep analysis at 5000 messages; a random sample is
    # representative enough for the aggregate visualizations downstream.
    if len(df) > 5000:
        print(f"Sampling 5000 messages from {len(df)}...")
        df_sample = df.sample(5000, random_state=42).copy()
    else:
        df_sample = df.copy()

    # Optimization: detect the dominant language once on a joined text sample
    # instead of per message — langdetect is slow and noisy on short texts.
    sample_size = min(len(df_sample), 500)
    sample_text = " ".join(df_sample["message"].sample(sample_size, random_state=42).tolist())
    try:
        dominant_lang = detect(sample_text)
    except LangDetectException:
        dominant_lang = 'en'

    nlp = nlp_fr if dominant_lang == 'fr' else nlp_en

    # Batch lemmatization via nlp.pipe; NER and the parser are not needed here.
    lemmatized_messages = []
    for doc in nlp.pipe(df_sample["message"].tolist(), batch_size=1000, disable=["ner", "parser"]):
        lemmatized_messages.append(" ".join([token.lemma_ for token in doc if not token.is_punct]))

    df_sample["lemmatized_message"] = lemmatized_messages

    # Apply sentiment analysis — batched for speed.
    df_sample['sentiment'] = predict_sentiment_bert_batch(df_sample["message"].tolist(), batch_size=128)

    # Defensive: drop rows with a null lemmatized_message.
    df_sample = df_sample.dropna(subset=['lemmatized_message'])

    # Custom stop-word list keeps chat filler out of the topic vocabulary.
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
    try:
        dtm = vectorizer.fit_transform(df_sample['lemmatized_message'])
    except ValueError:
        # The vocabulary can end up empty (e.g. every token was a stop word).
        print("Warning: Empty vocabulary after filtering. Returning empty topics.")
        return df_sample, []

    # Apply LDA and assign each message its most likely topic.
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    # TODO confirm: this fit call is reconstructed from a diff gap — the
    # transform below requires a fitted model, so it must exist.
    lda.fit(dtm)
    topic_results = lda.transform(dtm)
    df_sample = df_sample.iloc[:topic_results.shape[0]].copy()
    df_sample['topic'] = topic_results.argmax(axis=1)

    # Top-10 terms per topic, for visualization.
    topics = []
    for topic in lda.components_:
        topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

    # NOTE: when sampling occurred, the returned frame covers only the sample;
    # callers must not assume it has the same length as the input.
    return df_sample, topics