Spaces:

afanyu237
/

whatsApp_chat

Sleeping

App Files Files Community

afanyu237 committed on Dec 5, 2025

Commit

d67c92b

verified ·

1 Parent(s): 6cc22fb

Update preprocessor.py

Browse files

Files changed (1) hide show

preprocessor.py +103 -62

preprocessor.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import re
 import pandas as pd
 # from sentiment_train import predict_sentiment
-from sentiment import predict_sentiment_bert
 import spacy
 from langdetect import detect, LangDetectException
 from sklearn.feature_extraction.text import CountVectorizer
@@ -72,64 +72,52 @@ def preprocess_for_clustering(df, n_clusters=5):
     return df, reduced_features, kmeans.cluster_centers_
-def preprocess(data):
     pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
-    filtered_messages = []
-    valid_dates = []
-    for line in data.strip().split("\n"):
-        match = re.match(pattern, line)
-        if match:
-            entry = match.groupdict()
-            sender = entry.get("Sender")
-            if sender and sender.strip().lower() != "system":  # Remove system messages
-                filtered_messages.append(f"{sender.strip()}: {entry['Message']}")
-                valid_dates.append(f"{entry['Date']}, {entry['Time'].replace('â€¯', ' ')}")
-    # Create DataFrame
-    df = pd.DataFrame({'user_message': filtered_messages, 'message_date': valid_dates})
-    df['message_date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
-    df.rename(columns={'message_date': 'date'}, inplace=True)
-    # Separate Users and Messages
-    users, messages = [], []
-    msg_pattern = r"^(.*?):\s(.*)$"
-    for message in df["user_message"]:
-        match = re.match(msg_pattern, message)
-        if match:
-            users.append(match.group(1))
-            messages.append(match.group(2))
-        else:
-            users.append("group_notification")
-            messages.append(message)
-    df["user"] = users
-    df["message"] = messages
     df = df[df["user"] != "group_notification"]
     df.reset_index(drop=True, inplace=True)
-   # unfiltered  messages
     df["unfiltered_messages"] = df["message"]
     # Clean messages
     df["message"] = df["message"].apply(clean_message)
-    # Filter and lemmatize messages
-    lemmatized_messages = []
-    for message in df["message"]:
-        try:
-            lang = detect(message)
-            lemmatized_messages.append(lemmatize_text(message, lang))
-        except LangDetectException:
-            lemmatized_messages.append("")
-    df["lemmatized_message"] = lemmatized_messages
-    # Drop original column
-    df.drop(columns=["user_message"], inplace=True)
     # Extract time-based features
     df['year'] = df['date'].dt.year
     df['month'] = df['date'].dt.month_name()
@@ -138,16 +126,67 @@ def preprocess(data):
     df['day_of_week'] = df['date'].dt.day_name()
     df['minute'] = df['date'].dt.minute
     # Apply sentiment analysis
-    half_data = df.head(len(df) // 2)  # Select first half of the dataset
-    df['sentiment'] = df["message"].map(predict_sentiment_bert)
     # Filter out rows with null lemmatized_message
-    df = df.dropna(subset=['lemmatized_message'])
     # **Fix: Use a custom stop word list**
     vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
-    dtm = vectorizer.fit_transform(df['lemmatized_message'])
     # Apply LDA
     lda = LatentDirichletAllocation(n_components=5, random_state=42)
@@ -155,15 +194,17 @@ def preprocess(data):
     # Assign topics to messages
     topic_results = lda.transform(dtm)
-    df = df.iloc[:topic_results.shape[0]].copy()
-    df['topic'] = topic_results.argmax(axis=1)
     # Store topics for visualization
     topics = []
     for topic in lda.components_:
         topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
-    print(topics)
-    print(type(topics))
-    return df,topic

 import re
 import pandas as pd
 # from sentiment_train import predict_sentiment
+from sentiment import predict_sentiment_bert_batch
 import spacy
 from langdetect import detect, LangDetectException
 from sklearn.feature_extraction.text import CountVectorizer
     return df, reduced_features, kmeans.cluster_centers_
+def parse_data(data):
+    """
+    Parses the raw chat data into a DataFrame and performs basic cleaning.
+    """
+    # Optimization: Use pandas vectorized string operations instead of looping
+    # Split lines
+    lines = data.strip().split("\n")
+    df = pd.DataFrame({'line': lines})
+    # Extract Date, Time, Sender, Message using regex
     pattern = r"^(?P<Date>\d{1,2}/\d{1,2}/\d{2,4}),\s+(?P<Time>[\d:]+(?:\S*\s?[AP]M)?)\s+-\s+(?:(?P<Sender>.*?):\s+)?(?P<Message>.*)$"
+    extracted = df['line'].str.extract(pattern)
+    # Drop lines that didn't match (if any)
+    extracted = extracted.dropna(subset=['Date', 'Time', 'Message'])
+    # Combine Date and Time
+    extracted['Time'] = extracted['Time'].str.replace('â€¯', ' ', regex=False)
+    extracted['message_date'] = extracted['Date'] + ", " + extracted['Time']
+    # Handle Sender
+    extracted['Sender'] = extracted['Sender'].fillna('group_notification')
+    # Rename columns
+    df = extracted.rename(columns={'Sender': 'user', 'Message': 'message'})
+    # Filter out system messages
+    df = df[df['user'].str.lower() != 'system']
+    # Convert date
+    df['date'] = pd.to_datetime(df['message_date'], format='%m/%d/%y, %I:%M %p', errors='coerce')
+    # Filter out invalid dates
+    df = df.dropna(subset=['date'])
+    # Filter out group notifications
     df = df[df["user"] != "group_notification"]
     df.reset_index(drop=True, inplace=True)
+    # unfiltered  messages
     df["unfiltered_messages"] = df["message"]
     # Clean messages
     df["message"] = df["message"].apply(clean_message)
     # Extract time-based features
     df['year'] = df['date'].dt.year
     df['month'] = df['date'].dt.month_name()
     df['day_of_week'] = df['date'].dt.day_name()
     df['minute'] = df['date'].dt.minute
+    period = []
+    for hour in df['hour']:
+        if hour == 23:
+            period.append(str(hour) + "-" + str('00'))
+        elif hour == 0:
+            period.append(str('00') + "-" + str(hour + 1))
+        else:
+            period.append(str(hour) + "-" + str(hour + 1))
+    df['period'] = period
+    return df
+def analyze_sentiment_and_topics(df):
+    """
+    Performs heavy NLP tasks: Lemmatization, Sentiment Analysis, and Topic Modeling.
+    Includes sampling for large datasets.
+    """
+    # Sampling Logic: Cap at 5000 messages for deep analysis
+    original_df_len = len(df)
+    if len(df) > 5000:
+        print(f"Sampling 5000 messages from {len(df)}...")
+        # We keep the original index to potentially map back, but for now we just work on the sample
+        df_sample = df.sample(5000, random_state=42).copy()
+    else:
+        df_sample = df.copy()
+    # Filter and lemmatize messages
+    lemmatized_messages = []
+    # Optimization: Detect dominant language on a sample
+    sample_size = min(len(df_sample), 500)
+    sample_text = " ".join(df_sample["message"].sample(sample_size, random_state=42).tolist())
+    try:
+        dominant_lang = detect(sample_text)
+    except LangDetectException:
+        dominant_lang = 'en'
+    nlp = nlp_fr if dominant_lang == 'fr' else nlp_en
+    # Use nlp.pipe for batch processing
+    lemmatized_messages = []
+    for doc in nlp.pipe(df_sample["message"].tolist(), batch_size=1000, disable=["ner", "parser"]):
+        lemmatized_messages.append(" ".join([token.lemma_ for token in doc if not token.is_punct]))
+    df_sample["lemmatized_message"] = lemmatized_messages
     # Apply sentiment analysis
+    # Use batch processing for speed
+    df_sample['sentiment'] = predict_sentiment_bert_batch(df_sample["message"].tolist(), batch_size=128)
     # Filter out rows with null lemmatized_message
+    df_sample = df_sample.dropna(subset=['lemmatized_message'])
     # **Fix: Use a custom stop word list**
     vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words)
+    try:
+        dtm = vectorizer.fit_transform(df_sample['lemmatized_message'])
+    except ValueError:
+        # Handle case where vocabulary is empty (e.g. all stop words)
+        print("Warning: Empty vocabulary after filtering. Returning empty topics.")
+        return df_sample, []
     # Apply LDA
     lda = LatentDirichletAllocation(n_components=5, random_state=42)
     # Assign topics to messages
     topic_results = lda.transform(dtm)
+    df_sample = df_sample.iloc[:topic_results.shape[0]].copy()
+    df_sample['topic'] = topic_results.argmax(axis=1)
     # Store topics for visualization
     topics = []
     for topic in lda.components_:
         topics.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
+    # If we sampled, we return the sampled dataframe with sentiment/topics.
+    # The main app will need to handle that 'df' (full) and 'df_analyzed' (sample) might be different.
+    # Or we can try to merge back? Merging back 5000 sentiments to 40000 messages leaves 35000 nulls.
+    # For visualization purposes (pie charts, etc), using the sample is usually fine as it's representative.
+    return df_sample, topics