# whatsapp-chat-analyzer / src/preprocessor.py
# SandhyaRaghav's picture
# Update src/preprocessor.py
# fb999e2 verified
import re
import pandas as pd
# Matches the "<date>, <time> <AM/PM> - " prefix that precedes each exported
# message. It deliberately has no capturing groups so that ``split`` and
# ``findall`` produce aligned lists.
_TIMESTAMP_PATTERN = re.compile(
    r'\d{1,2}/\d{1,2}/\d{2,4},\s*(?:1[0-2]|0?[1-9]):[0-5][0-9](?::[0-5][0-9])?\s*[\s\u202f\u00a0]?(?:AM|PM)\s*-\s*',
    re.I,
)

# Pulls the individual fields out of a prefix already matched by
# ``_TIMESTAMP_PATTERN`` so it can be rebuilt in one canonical layout.
_TIMESTAMP_FIELDS = re.compile(
    r'(\d{1,2})/(\d{1,2})/(\d{2,4}),\s*(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(AM|PM)',
    re.I,
)

# Canonical layouts produced by ``_normalise_timestamp``: day-first date,
# 12-hour clock with seconds; one variant per year width.
_DATE_FORMATS = (
    '%d/%m/%y %I:%M:%S %p',
    '%d/%m/%Y %I:%M:%S %p',
)

# Sender name: shortest leading text followed by a colon and whitespace.
_SENDER_PATTERN = re.compile(r'^([\w\W]+?):\s')


def _normalise_timestamp(stamp):
    """Rebuild a raw timestamp prefix as 'D/M/Y H:MM:SS AM|PM'."""
    fields = _TIMESTAMP_FIELDS.search(stamp)
    if fields is None:
        # Leave it untouched; parsing will fail and yield NaT for this row.
        return stamp
    day, month, year, hour, minute, second, meridiem = fields.groups()
    # Missing seconds default to "00" so a single layout covers both variants.
    return f"{day}/{month}/{year} {hour}:{minute}:{second or '00'} {meridiem.upper()}"


def _parse_timestamps(stamps):
    """Return a datetime Series for the raw prefixes (NaT where unparseable)."""
    canonical = pd.Series([_normalise_timestamp(s) for s in stamps], dtype=object)
    parsed = pd.to_datetime(canonical, format=_DATE_FORMATS[0], errors='coerce')
    for fmt in _DATE_FORMATS[1:]:
        # Only rows that are still NaT are filled by the next layout.
        parsed = parsed.fillna(pd.to_datetime(canonical, format=fmt, errors='coerce'))
    return parsed


def _split_sender(text):
    """Split 'Sender: body' into (sender, body); no sender means a notification."""
    match = _SENDER_PATTERN.match(text)
    if match is None:
        return 'group_notification', text.strip()
    return match.group(1).strip(), text[match.end():].strip()


def preprocess(data):
    """
    Preprocesses raw WhatsApp chat data into a structured pandas DataFrame.

    Args:
        data: Full text of an exported chat in which every message starts
            with a ``D/M/Y, H:MM[:SS] AM|PM - `` prefix. Dates are read
            day-first; seconds and the spacing around the comma, the AM/PM
            marker and the dash are optional.

    Returns:
        A DataFrame with the columns ``date``, ``user``, ``message``,
        ``only_date``, ``year``, ``month_num``, ``month``, ``day``,
        ``day_name``, ``hour`` and ``minute``. Entries without a
        ``Sender: `` prefix get the user ``'group_notification'``. Rows
        whose timestamp cannot be parsed (e.g. an impossible calendar date)
        are dropped. ``None`` is returned if the number of messages and
        timestamps ever disagree.

    Note:
        A notification whose text itself contains ``': '`` is
        indistinguishable from a regular message and is attributed to the
        text before that colon.
    """
    print("Preprocess started")

    # Element 0 of the split is whatever precedes the first timestamp.
    messages = _TIMESTAMP_PATTERN.split(data)[1:]
    stamps = _TIMESTAMP_PATTERN.findall(data)
    print(f"Found {len(messages)} messages and {len(stamps)} dates")

    # Defensive guard: with a group-free pattern the two lists always align,
    # but a future pattern change adding capture groups would break that.
    if len(messages) != len(stamps):
        print("Error: The number of messages and dates do not match.")
        return None

    parts = [_split_sender(message) for message in messages]
    df = pd.DataFrame({
        'date': _parse_timestamps(stamps),
        'user': [sender for sender, _ in parts],
        'message': [body for _, body in parts],
    })

    if df['date'].isna().any():
        print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")
    df.dropna(subset=['date'], inplace=True)

    # Temporal features derived from the parsed timestamp.
    stamp = df['date'].dt
    df['only_date'] = stamp.date
    df['year'] = stamp.year
    df['month_num'] = stamp.month
    df['month'] = stamp.month_name()
    df['day'] = stamp.day
    df['day_name'] = stamp.day_name()
    df['hour'] = stamp.hour
    df['minute'] = stamp.minute
    return df