Spaces:

SandhyaRaghav
/

whatsapp-chat-analyzer

Sleeping

App Files Files Community

SandhyaRaghav committed on Nov 11, 2025

Commit

c1e18bd

verified ·

1 Parent(s): fa23c4d

Update preprocessor.py

Browse files

Files changed (1) hide show

preprocessor.py +70 -62

preprocessor.py CHANGED Viewed

@@ -5,88 +5,95 @@ import numpy as np
 def preprocess(data):
     print("Preprocess started")
-    # This regex pattern captures the date/time string, allowing for 2 or 4 digit years
-    # and accommodating various Unicode spaces/separators ([\s\u202f\u00a0]?)
-    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s(?:1[0-2]|0?[1-9]):[0-5][0-9][\s\u202f\u00a0]?(?:AM|PM|am|pm)\s-\s'
-    # --- STEP 1: Clean/Prepare Data ---
     data_lines = data.split('\n')
     cleaned_lines = []
-    first_message_found = False
-    for line in data_lines:
-        if re.match(pattern, line):
-            cleaned_lines.append(line)
-            first_message_found = True
-        elif not first_message_found and line and 'end-to-end encrypted' in line:
-            continue
-        elif line and first_message_found:
-             # Append multi-line messages
-            cleaned_lines.append(line)
-        elif line and not first_message_found:
-             # Skip other junk lines before the first message
-             continue
-    data = '\n'.join(cleaned_lines)
-    # --- STEP 2: Split Messages and Dates ---
-    messages = re.split(pattern, data)[1:]
-    date = re.findall(pattern, data)
-    print(f"Found {len(messages)} messages and {len(date)} dates")
-    dates = [d.replace('\u202f', ' ').replace('\u00a0', ' ').strip() for d in date]
     if len(messages) != len(dates) or len(messages) == 0:
         print(f"Error: Mismatched number of messages ({len(messages)}) and dates ({len(dates)}). Returning None.")
         return None
     df = pd.DataFrame({'user_message': messages, 'message_date': dates})
-    # --- STEP 3: Robust Date Parsing (Trying two common year formats) ---
-    # Attempt 1: 4-digit year format (Standard for newer exports)
-    format_4_digit_year = '%d/%m/%Y, %I:%M %p - '
-    # Attempt 2: 2-digit year format (Used by some older exports)
-    format_2_digit_year = '%d/%m/%y, %I:%M %p - '
-    # We rely on pandas 'errors="coerce"' to tell us if the first format worked.
-    df['date'] = pd.to_datetime(df['message_date'], format=format_4_digit_year, errors='coerce')
-    # Find which dates failed to parse (NaT = Not a Time)
-    unparsed_dates = df['date'].isna()
-    # If any dates failed, try parsing them with the 2-digit year format
-    if unparsed_dates.any():
-        df.loc[unparsed_dates, 'date'] = pd.to_datetime(
-            df.loc[unparsed_dates, 'message_date'],
-            format=format_2_digit_year,
-            errors='coerce'
-        )
-    # Drop rows where neither format worked
     df.dropna(subset=['date'], inplace=True)
     if df.empty:
-        print("Error: DataFrame is empty after parsing dates. Neither 4-digit nor 2-digit year formats worked.")
         return None
-    # --- STEP 4: Split User and Message Content ---
-    users = []
-    messages_list = []
-    for message in df['user_message']:
-        entry = re.split(r'([\w\W]+?):\s', message, 1)
-        if len(entry) > 2:
-            users.append(entry[1].strip())
-            messages_list.append(entry[2].strip())
-        else:
-            users.append('group_notification')
-            messages_list.append(entry[0].strip())
-    df['user'] = users
-    df['message'] = messages_list
-    # --- STEP 5: Add Metadata Columns ---
-    df.drop(columns=['user_message', 'message_date'], inplace=True)
     df['only_date'] = df['date'].dt.date
     df['year'] = df['date'].dt.year
     df['month_num'] = df['date'].dt.month
@@ -96,4 +103,5 @@ def preprocess(data):
     df['hour'] = df['date'].dt.hour
     df['minute'] = df['date'].dt.minute
     return df

 def preprocess(data):
     print("Preprocess started")
+    # NEW ROBUST REGEX PATTERN: Supports both 12-hour (H:MM AM/PM) and 24-hour (HH:MM) formats.
+    # It captures: Day/Month/Year, Space, Time (H:MM or HH:MM), optional AM/PM/unicode space, dash, space.
+    pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}(?:[\s\u202f\u00a0]?(?:AM|PM|am|pm))?) - '
+    # --- STEP 1: Separate metadata lines ---
+    # WhatsApp exports often have an initial line about end-to-end encryption.
     data_lines = data.split('\n')
     cleaned_lines = []
+    # We strip out the encryption header line or any preceding junk
+    start_index = 0
+    for i, line in enumerate(data_lines):
+        if re.search(pattern, line):
+            start_index = i
+            break
+    # Join the message content back starting from the first actual chat line
+    data = '\n'.join(data_lines[start_index:])
+    # --- STEP 2: Split Messages and Dates (using the capturing groups in the pattern) ---
+    # Extract messages: split the entire data string by the pattern
+    messages = re.split(pattern, data)[3::3] # Take every 3rd element starting from the 3rd index (the message content)
+    # Extract date/time stamps (they are the 1st and 2nd capturing group of every match)
+    matches = re.findall(pattern, data)
+    dates = []
+    for match in matches:
+        date_part = match[0] # e.g., '19/11/2023'
+        time_part = match[1] # e.g., '07:43' or '8:09 am'
+        # Combine date and time, stripping the unicode space that often appears in the time part
+        combined_dt = f"{date_part}, {time_part}".replace('\u202f', ' ').replace('\u00a0', ' ').strip()
+        dates.append(combined_dt)
+    print(f"Found {len(messages)} messages and {len(dates)} dates")
     if len(messages) != len(dates) or len(messages) == 0:
         print(f"Error: Mismatched number of messages ({len(messages)}) and dates ({len(dates)}). Returning None.")
+        # Returning None ensures Streamlit handles the parsing failure gracefully.
         return None
     df = pd.DataFrame({'user_message': messages, 'message_date': dates})
+    # --- STEP 3: Robust Date Parsing (Trying 12h, 24h, and 2/4 digit year formats) ---
+    # 1. Standard 12-hour format (e.g., 01/01/2025, 8:09 AM) - Robust Year
+    format_12h_4y = '%d/%m/%Y, %I:%M %p'
+    # 2. Standard 24-hour format (e.g., 19/11/2023, 07:43) - Robust Year
+    format_24h_4y = '%d/%m/%Y, %H:%M'
+    # 3. Standard 12-hour format - 2 Digit Year
+    format_12h_2y = '%d/%m/%y, %I:%M %p'
+    # 4. Standard 24-hour format - 2 Digit Year
+    format_24h_2y = '%d/%m/%y, %H:%M'
+    # Convert 'message_date' column to list of strings for processing
+    date_series = df['message_date']
+    # Initialize 'date' column with NaT (Not a Time)
+    df['date'] = pd.NaT
+    # List of formats to try, in order of likelihood
+    formats_to_try = [format_12h_4y, format_24h_4y, format_12h_2y, format_24h_2y]
+    for format_str in formats_to_try:
+        unparsed = df['date'].isna()
+        if unparsed.any():
+            # Try parsing the remaining unparsed dates with the current format string
+            df.loc[unparsed, 'date'] = pd.to_datetime(
+                df.loc[unparsed, 'message_date'],
+                format=format_str,
+                errors='coerce'
+            )
+    # Drop rows where parsing failed with all formats
     df.dropna(subset=['date'], inplace=True)
     if df.empty:
+        print("Error: DataFrame is empty after parsing dates. All date formats failed.")
         return None
+    df.rename(columns={'message_date': 'timestamp_string'}, inplace=True)
+    df['user'] = df['user_message'].apply(lambda x: re.split(r'([\w\W]+?):\s', x, 1)[1].strip() if len(re.split(r'([\w\W]+?):\s', x, 1)) > 2 else 'group_notification')
+    df['message'] = df['user_message'].apply(lambda x: re.split(r'([\w\W]+?):\s', x, 1)[2].strip() if len(re.split(r'([\w\W]+?):\s', x, 1)) > 2 else x.strip())
+    # Clean up group notifications and drops
+    df.drop(columns=['user_message'], inplace=True)
+    df = df[df['user'] != 'group_notification'].copy()
+    # --- STEP 4: Add Metadata Columns ---
     df['only_date'] = df['date'].dt.date
     df['year'] = df['date'].dt.year
     df['month_num'] = df['date'].dt.month
     df['hour'] = df['date'].dt.hour
     df['minute'] = df['date'].dt.minute
+    print(f"Preprocess finished with {df.shape[0]} valid messages.")
     return df