Spaces:

SandhyaRaghav
/

whatsapp-chat-analyzer

Sleeping

App Files Files Community

SandhyaRaghav committed on Nov 11, 2025

Commit

3fc588c

verified ·

1 Parent(s): c1e18bd

Update src/preprocessor.py

Browse files

Files changed (1) hide show

src/preprocessor.py +82 -48

src/preprocessor.py CHANGED Viewed

@@ -1,48 +1,82 @@
-import re
-import pandas as pd
-def preprocess(data):
-    """Turn a raw WhatsApp chat export (one string) into a pandas DataFrame.
-
-    The text is cut at every leading timestamp, each timestamp is parsed as
-    day/month/2-digit-year with a 12-hour clock, and every chunk is divided
-    into a sender and a message body.
-
-    Args:
-        data: Full text of the exported chat.
-
-    Returns:
-        A DataFrame with the columns ``date``, ``user``, ``message``,
-        ``only_date``, ``year``, ``month_num``, ``month``, ``day``,
-        ``day_name``, ``hour`` and ``minute``; or ``None`` when the
-        timestamps cannot be parsed.
-    """
-    print("Preprocess started")
-    # Timestamp prefix such as "01/01/25, 8:09 pm - "; the character class
-    # before AM/PM also accepts U+202F and U+00A0 as the separator.
-    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s(?:1[0-2]|0?[1-9]):[0-5][0-9][\s\u202f\u00a0]?(?:AM|PM|am|pm)\s-\s'
-    # Text before the first timestamp is discarded by the [1:] slice.
-    messages = re.split(pattern, data)[1:]
-    date = re.findall(pattern, data)
-    print(f"Found {len(messages)} messages and {len(date)} dates")
-    # Replace the no-break space variants so the fixed format below can match.
-    dates = [d.replace('\u202f', ' ').replace('\u00a0', ' ') for d in date]
-    df = pd.DataFrame({'user_message': messages, 'message_date': dates})
-    try:
-        # NOTE(review): a single fixed format (2-digit year, no seconds, exactly
-        # one space before AM/PM) is used; one non-conforming timestamp makes
-        # the whole call fail and the function return None.
-        df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%y, %I:%M %p - ')
-    except Exception as e:
-        print("Date parsing error:", e)
-        return None
-    df.rename(columns={'message_date': 'date'}, inplace=True)
-    users = []
-    messages_list = []
-    for message in df['user_message']:
-        # NOTE(review): the split is neither anchored nor limited, so it fires
-        # at every ": " in the chunk; the pieces of a body that itself contains
-        # ": " are glued back together with single spaces below.
-        entry = re.split(r'([\w\W]+?):\s', message)
-        if entry[1:]:  # user exists
-            users.append(entry[1])
-            messages_list.append(" ".join(entry[2:]))
-        else:
-            # No "name: " prefix found: the chunk is labelled as a group notification.
-            users.append('group_notification')
-            messages_list.append(entry[0])
-    df['user'] = users
-    df['message'] = messages_list
-    df.drop(columns=['user_message'], inplace=True)
-    # Calendar/time features derived from the parsed timestamp.
-    df['only_date'] = df['date'].dt.date
-    df['year'] = df['date'].dt.year
-    df['month_num'] = df['date'].dt.month
-    df['month'] = df['date'].dt.month_name()
-    df['day'] = df['date'].dt.day
-    df['day_name'] = df['date'].dt.day_name()
-    df['hour'] = df['date'].dt.hour
-    df['minute'] = df['date'].dt.minute
-    return df

+import re
+import pandas as pd
+# Timestamp that opens every exported message, e.g. "01/01/25, 8:09 pm - ".
+# It deliberately has no capturing groups: re.split() would otherwise
+# interleave the captured pieces with the message bodies.
+_TIMESTAMP_PATTERN = r'\d{1,2}/\d{1,2}/\d{2,4},\s*(?:1[0-2]|0?[1-9]):[0-5][0-9](?::[0-5][0-9])?\s*[\s\u202f\u00a0]?(?:AM|PM)\s*-\s*'
+# Takes one matched timestamp apart so it can be rebuilt in a single spelling.
+_TIMESTAMP_FIELDS = re.compile(
+    r'(\d{1,2})/(\d{1,2})/(\d{2,4}),\s*(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(AM|PM)',
+    flags=re.I,
+)
+# Sender name: everything up to the first ": " at the start of a chunk.
+_SENDER_PATTERN = re.compile(r'^([\w\W]+?):\s')
+def _canonical_timestamp(raw):
+    """Rewrite one matched timestamp as 'D/M/Y, H:MM:SS AM|PM'.
+
+    Optional seconds default to '00', any spacing (including U+202F/U+00A0)
+    before the meridiem becomes one plain space, the meridiem is upper-cased
+    and the trailing ' - ' separator is dropped. Text that does not look like
+    a timestamp is returned unchanged so that parsing turns it into NaT.
+    """
+    found = _TIMESTAMP_FIELDS.match(raw)
+    if found is None:
+        return raw
+    day, month, year, hour, minute, second, meridiem = found.groups()
+    return f"{day}/{month}/{year}, {hour}:{minute}:{second or '00'} {meridiem.upper()}"
+def _parse_timestamps(raw_timestamps):
+    """Parse matched timestamps into a datetime Series (NaT where unparseable)."""
+    canonical = pd.Series([_canonical_timestamp(t) for t in raw_timestamps], dtype='object')
+    # After normalisation only the year width varies, so two formats cover
+    # every spelling the split pattern accepts. Dates are read day-first.
+    short_year = pd.to_datetime(canonical, format='%d/%m/%y, %I:%M:%S %p', errors='coerce')
+    long_year = pd.to_datetime(canonical, format='%d/%m/%Y, %I:%M:%S %p', errors='coerce')
+    return short_year.fillna(long_year)
+def preprocess(data):
+    """
+    Preprocesses raw WhatsApp chat data into a structured pandas DataFrame.
+
+    Args:
+        data: Full text of an exported chat. Each message is expected to start
+            with a day/month/year timestamp on a 12-hour clock followed by
+            ' - ', e.g. '01/01/25, 8:09 pm - Alice: hello'.
+
+    Returns:
+        A DataFrame with one row per message and the columns ``date``,
+        ``user``, ``message``, ``only_date``, ``year``, ``month_num``,
+        ``month``, ``day``, ``day_name``, ``hour`` and ``minute``, indexed
+        0..n-1. Chunks without a 'name: ' prefix get the user
+        'group_notification'. Rows whose timestamp cannot be parsed are
+        dropped. ``None`` is returned if the number of message chunks and
+        timestamps differ.
+    """
+    print("Preprocess started")
+    # Text before the first timestamp is not a message, hence the [1:] slice.
+    messages = re.split(_TIMESTAMP_PATTERN, data, flags=re.I)[1:]
+    timestamps = re.findall(_TIMESTAMP_PATTERN, data, flags=re.I)
+    print(f"Found {len(messages)} messages and {len(timestamps)} dates")
+    if len(messages) != len(timestamps):
+        print("Error: The number of messages and dates do not match.")
+        return None
+    # --- Date Parsing ---
+    # Timestamps are normalised before parsing: matching the raw text against
+    # a fixed list of strptime formats silently discarded spellings the split
+    # pattern accepts (2-digit year with seconds, '8:09PM', extra spaces).
+    df = pd.DataFrame({'user_message': messages, 'date': _parse_timestamps(timestamps)})
+    if df['date'].isna().any():
+        print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")
+    # --- Extract Users and Messages ---
+    users = []
+    messages_list = []
+    for chunk in df['user_message']:
+        # maxsplit=1 keeps any later ': ' inside the message body intact.
+        entry = _SENDER_PATTERN.split(chunk, maxsplit=1)
+        if len(entry) > 1:  # Standard message format: Sender: Message
+            users.append(entry[1].strip())
+            messages_list.append(entry[2].strip())
+        else:  # Group notification or metadata
+            users.append('group_notification')
+            messages_list.append(entry[0].strip())
+    df['user'] = users
+    df['message'] = messages_list
+    # --- Final Cleanup and Feature Creation ---
+    df.drop(columns=['user_message'], inplace=True)
+    df.dropna(subset=['date'], inplace=True)  # Drop rows where date parsing failed
+    # Dropping rows leaves holes in the index; renumber so positions and labels agree.
+    df.reset_index(drop=True, inplace=True)
+    # Extract temporal features
+    df['only_date'] = df['date'].dt.date
+    df['year'] = df['date'].dt.year
+    df['month_num'] = df['date'].dt.month
+    df['month'] = df['date'].dt.month_name()
+    df['day'] = df['date'].dt.day
+    df['day_name'] = df['date'].dt.day_name()
+    df['hour'] = df['date'].dt.hour
+    df['minute'] = df['date'].dt.minute
+    return df