import re

import pandas as pd

# Timestamp that opens every exported message, e.g. "01/01/25, 8:09 pm - ".
# It accepts exactly the same text as the original split pattern; the named
# groups only let us rebuild each stamp in one canonical layout afterwards.
_TIMESTAMP_RE = re.compile(
    r'(?P<date>\d{1,2}/\d{1,2}/\d{2,4}),\s*'
    r'(?P<time>(?:1[0-2]|0?[1-9]):[0-5][0-9](?::[0-5][0-9])?)'
    r'\s*[\s\u202f\u00a0]?(?P<meridiem>AM|PM)\s*-\s*',
    flags=re.I,
)

# Sender name, matched non-greedily up to the first ": " of the message.
_SENDER_RE = re.compile(r'^([\w\W]+?):\s')

# Day-first layouts tried in order; the first one that parses a row wins.
_KNOWN_FORMATS = (
    '%d/%m/%y, %I:%M %p - ',     # 2-digit year, no seconds
    '%d/%m/%Y, %I:%M %p - ',     # 4-digit year, no seconds
    '%d/%m/%y, %I:%M:%S %p - ',  # 2-digit year, with seconds
    '%d/%m/%Y, %I:%M:%S %p - ',  # 4-digit year, with seconds
)


def _split_sender(text):
    """Return ``(user, message)`` for one raw message body."""
    found = _SENDER_RE.match(text)
    if found is None:
        # No "Sender: " prefix: group notification or other metadata line.
        return 'group_notification', text.strip()
    return found.group(1).strip(), text[found.end():].strip()


def preprocess(data: str) -> pd.DataFrame:
    """Turn a raw WhatsApp chat export into a structured DataFrame.

    The text is cut at every ``D/M/Y, H:MM[:SS] AM|PM - `` timestamp. Each
    timestamp is parsed day-first, and the text up to the next timestamp is
    split into sender and message body.

    Args:
        data: Full text of the exported chat.

    Returns:
        A DataFrame with the columns ``date``, ``user``, ``message``,
        ``only_date``, ``year``, ``month_num``, ``month``, ``day``,
        ``day_name``, ``hour`` and ``minute``. Rows whose timestamp could
        not be parsed are dropped (a warning is printed); the index is not
        reset afterwards. Bodies without a ``Sender: `` prefix get the user
        ``'group_notification'``.
    """
    print("Preprocess started")

    # One scan yields both the timestamps and the text between them, so the
    # two lists are the same length by construction.
    matches = list(_TIMESTAMP_RE.finditer(data))
    body_ends = [m.start() for m in matches[1:]] + [len(data)]
    bodies = [data[m.end():end] for m, end in zip(matches, body_ends)]
    print(f"Found {len(bodies)} messages and {len(matches)} dates")

    # Rebuild every stamp with single ASCII spaces. The regex tolerates
    # missing or non-breaking whitespace ("8:09PM", "8:09\u202fpm") that the
    # strptime formats below would otherwise reject.
    stamps = pd.Series(
        [
            f"{m.group('date')}, {m.group('time')} {m.group('meridiem')} - "
            for m in matches
        ],
        dtype=object,
    )

    # --- Date parsing: try each known layout, keep the first success ---
    parsed = None
    for fmt in _KNOWN_FORMATS:
        attempt = pd.to_datetime(stamps, format=fmt, errors='coerce')
        parsed = attempt if parsed is None else parsed.fillna(attempt)

    if parsed.isna().any():
        print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")

    # --- Extract users and messages ---
    senders = [_split_sender(body) for body in bodies]

    df = pd.DataFrame({
        'date': parsed,
        'user': [user for user, _ in senders],
        'message': [text for _, text in senders],
    })
    df.dropna(subset=['date'], inplace=True)  # Drop rows where date parsing failed

    # --- Temporal features ---
    when = df['date'].dt
    df['only_date'] = when.date
    df['year'] = when.year
    df['month_num'] = when.month
    df['month'] = when.month_name()
    df['day'] = when.day
    df['day_name'] = when.day_name()
    df['hour'] = when.hour
    df['minute'] = when.minute

    return df