Spaces:

SandhyaRaghav
/

whatsapp-chat-analyzer

Running

App Files Files Community

SandhyaRaghav committed on Nov 12, 2025

Commit

6513ecd

verified ·

1 Parent(s): 3fc588c

Update src/preprocessor.py

Browse files

Files changed (1) hide show

src/preprocessor.py +90 -45

src/preprocessor.py CHANGED Viewed

@@ -2,74 +2,107 @@ import re
 import pandas as pd
 def preprocess(data):
-    """
-    Preprocesses raw WhatsApp chat data into a structured pandas DataFrame.
-    """
-    print("Preprocess started")
-    # Robust regex to capture the timestamp pattern including optional seconds and AM/PM variations.
-    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s*(?:1[0-2]|0?[1-9]):[0-5][0-9](?::[0-5][0-9])?\s*[\s\u202f\u00a0]?(?:AM|PM)\s*-\s*'
-    # Split the data by the timestamp pattern (case-insensitive flag re.I).
-    messages = re.split(pattern, data, flags=re.I)[1:]
-    # Find all occurrences of the timestamp pattern.
-    date = re.findall(pattern, data, flags=re.I)
-    print(f"Found {len(messages)} messages and {len(date)} dates")
-    if len(messages) != len(date):
-        print("Error: The number of messages and dates do not match.")
         return None
-    # Clean up dates before processing by removing non-breaking spaces
-    dates = [d.replace('\u202f', ' ').replace('\u00a0', ' ') for d in date]
-    df = pd.DataFrame({'user_message': messages, 'message_date': dates})
-    # --- Date Parsing: Try common formats ---
-    known_formats = [
-        '%d/%m/%y, %I:%M %p - ',        # Format without seconds (e.g., 01/01/25, 8:09 am)
-        '%d/%m/%Y, %I:%M %p - ',        # Format without seconds (4-digit year)
-        '%d/%m/%Y, %I:%M:%S %p - '      # Format with seconds (4-digit year)
-    ]
     df['date'] = pd.NaT
-    for fmt in known_formats:
-        converted = pd.to_datetime(df['message_date'], format=fmt, errors='coerce')
-        # Fill existing NaT values with successful conversions
         df['date'] = df['date'].fillna(converted)
-    if df['date'].isna().any():
-        print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")
     df.rename(columns={'date': 'date'}, inplace=True)
-    # --- Extract Users and Messages ---
     users = []
-    messages_list = []
-    # Regex to capture sender name non-greedily up to the colon separator.
-    user_pattern = r'^([\w\W]+?):\s'
     for message in df['user_message']:
         entry = re.split(user_pattern, message, maxsplit=1)
-        if len(entry) > 1: # Standard message format: Sender: Message
             users.append(entry[1].strip())
-            messages_list.append(entry[2].strip())
-        else: # Group notification or metadata
             users.append('group_notification')
-            messages_list.append(entry[0].strip())
     df['user'] = users
-    df['message'] = messages_list
-    # --- Final Cleanup and Feature Creation ---
-    df.drop(columns=['user_message', 'message_date'], inplace=True)
-    df.dropna(subset=['date'], inplace=True) # Drop rows where date parsing failed
-    # Extract temporal features
     df['only_date'] = df['date'].dt.date
     df['year'] = df['date'].dt.year
     df['month_num'] = df['date'].dt.month
@@ -79,4 +112,16 @@ def preprocess(data):
     df['hour'] = df['date'].dt.hour
     df['minute'] = df['date'].dt.minute
     return df

 import pandas as pd
 def preprocess(data):
+    # --- 1. Define Flexible Regex Pattern ---
+    # The pattern is made flexible to accommodate:
+    # 1. 2-digit or 4-digit year (\d{2,4}).
+    # 2. Optional seconds (?::\d{2})?.
+    # 3. Optional AM/PM indicator with flexible spacing/non-breaking spaces: \s*[\s\u202f\u00a0]?(?:AM|PM|am|pm)?.
+    # 4. The initial time component must match both 24H (00-23) and 12H (01-12) formats.
+    # We use non-capturing groups (?:...) where applicable.
+    # This pattern captures only the date and time parts. The AM/PM marker is matched by a non-capturing group,
+    # so it is NOT included in the groups returned by re.findall/re.split and never reaches pandas in the final parsing step.
+    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}),\s*(\d{1,2}:\d{2}(?::\d{2})?)\s*[\s\u202f\u00a0]?(?:AM|PM|am|pm)?\s*-\s*'
+    # --- 2. Split Messages and Extract Date Strings ---
+    # re.split uses capturing groups from the pattern, leading to the structure: [noise, date_part1, date_part2, ..., message, date_part1, ...].
+    # We use list comprehension to reconstruct the raw timestamp string and extract the message content.
+    # Find all matches of the full raw date string for later alignment.
+    # The pattern is complex, so we will use re.split/re.findall to extract the message boundaries.
+    # Re-running findall, but being extra permissive to capture the whole raw date string for later parsing
+    date_pattern_raw = r'(\d{1,2}/\d{1,2}/\d{2,4}),\s*(\d{1,2}:\d{2}(?::\d{2})?)\s*[\s\u202f\u00a0]?(?:AM|PM|am|pm)?\s*-\s*'
+    dates = re.findall(date_pattern_raw, data, flags=re.I)
+    # We now split the data to get the message content parts
+    messages = re.split(date_pattern_raw, data, flags=re.I)
+    messages = [m.strip() for m in messages if m.strip()]
+    # After splitting, the list contains interleaved date groups (2 capturing groups: date and time) and message content.
+    # Reconstruct the raw date strings from the groups that were successfully found.
+    # The captured pieces are the Date and Time groups only; the optional AM/PM marker is matched but not captured.
+    raw_dates = []
+    # In the raw re.split output, index 0 is the leading noise. Date groups start at index 1 and repeat every 3 indices.
+    # The structure is [Noise, D1, T1, M1, D2, T2, M2, ...] (AM/PM is non-capturing, so it has no slot of its own).
+    # The simple split approach used before is very fragile with capturing groups.
+    # Let's revert to a non-capturing split, then reconstruct the dates to keep it simple.
+    # Use a non-capturing group for the split to get clean message blocks.
+    message_blocks = re.split(r'(?:\d{1,2}/\d{1,2}/\d{2,4}),\s*\d{1,2}:\d{2}(?::\d{2})?\s*[\s\u202f\u00a0]?(?:AM|PM|am|pm)?\s*-\s*', data, flags=re.I)[1:]
+    # Re-find the full date strings to ensure alignment.
+    raw_date_strings = re.findall(date_pattern_raw, data, flags=re.I)
+    if len(message_blocks) != len(raw_date_strings):
+        # This occurs if non-date text exists before the first date in the file,
+        # which is handled by removing the first split element, or due to a bad pattern match.
+        print("Error: Message and date counts are mismatched. Check the first line of the chat file.")
         return None
+    # Clean up dates by joining and removing non-breaking spaces
+    dates_for_df = [" ".join(d).strip().replace('\u202f', ' ').replace('\u00a0', ' ') for d in raw_date_strings]
+    df = pd.DataFrame({'user_message': message_blocks, 'message_date': dates_for_df})
+    # --- 3. Robust Date Parsing ---
+    # We define all possible expected formats (24H, 12H, with/without seconds, 2/4-digit year)
+    DATE_FORMATS = [
+        # 12-Hour Formats (with AM/PM)
+        '%d/%m/%y, %I:%M %p - ',
+        '%d/%m/%Y, %I:%M %p - ',
+        '%d/%m/%y, %I:%M:%S %p - ',
+        '%d/%m/%Y, %I:%M:%S %p - ',
+        # 24-Hour Formats (without AM/PM)
+        '%d/%m/%y, %H:%M - ',
+        '%d/%m/%Y, %H:%M - ',
+        '%d/%m/%y, %H:%M:%S - ',
+        '%d/%m/%Y, %H:%M:%S - '
+    ]
     df['date'] = pd.NaT
+    # Iterate through all known formats, filling successful conversions and coercing failures to NaT
+    for fmt in DATE_FORMATS:
+        converted = pd.to_datetime(df['message_date'], format=fmt.strip(), errors='coerce')
         df['date'] = df['date'].fillna(converted)
     df.rename(columns={'date': 'date'}, inplace=True)
+    df.dropna(subset=['date'], inplace=True) # Drop rows where date parsing failed
+    # --- 4. Extract Users and Messages ---
     users = []
+    messages = []
+    user_pattern = r'^([\w\W]+?):\s'
     for message in df['user_message']:
+        # Split message into [Noise, User, Message Content]
         entry = re.split(user_pattern, message, maxsplit=1)
+        if len(entry) > 1:
             users.append(entry[1].strip())
+            messages.append(entry[2].strip())
+        else:
             users.append('group_notification')
+            messages.append(entry[0].strip())
     df['user'] = users
+    df['message'] = messages
+    df.drop(columns=['user_message'], inplace=True)
+    # --- 5. Extract Temporal Features ---
     df['only_date'] = df['date'].dt.date
     df['year'] = df['date'].dt.year
     df['month_num'] = df['date'].dt.month
     df['hour'] = df['date'].dt.hour
     df['minute'] = df['date'].dt.minute
+    # --- 6. Calculate Message Period ---
+    period = []
+    for hour in df['hour']:
+        if hour == 23:
+            period.append(str(hour) + "-" + str('00'))
+        elif hour == 0:
+            period.append(str('00') + "-" + str(hour + 1))
+        else:
+            period.append(str(hour) + "-" + str(hour + 1))
+    df['period'] = period
     return df