File size: 3,104 Bytes
3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 6513ecd fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c fb999e2 3fc588c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import re
import pandas as pd
# Timestamp prefix that starts every exported chat entry, e.g.
# "01/01/25, 8:09 pm - " or "01/01/2025, 8:09:15\u202fPM - ".
# In a str pattern ``\s`` already matches the no-break spaces (U+00A0, U+202F)
# that some exports put before AM/PM, so no explicit character class is needed.
_TIMESTAMP_RE = re.compile(
    r'(?P<day>\d{1,2})/(?P<month>\d{1,2})/(?P<year>\d{2,4}),\s*'
    r'(?P<hour>1[0-2]|0?[1-9]):(?P<minute>[0-5][0-9])'
    r'(?::(?P<second>[0-5][0-9]))?'
    r'\s*(?P<meridiem>AM|PM)\s*-\s*',
    flags=re.I,
)

# "Sender: text" prefix. ``.`` does not cross newlines, so only the first line
# of an entry can name a sender; a colon on a later line of a multi-line
# notification is not mistaken for a sender separator.
_SENDER_RE = re.compile(r'^(.+?):\s')

# Formats for the canonical strings built by ``_canonical_stamp``; they differ
# only in 2-digit vs 4-digit year.
_DATE_FORMATS = (
    '%d/%m/%y %I:%M:%S %p',
    '%d/%m/%Y %I:%M:%S %p',
)


def _canonical_stamp(match):
    """Rebuild a timestamp match as 'D/M/Y H:M:S AM' with fixed spacing."""
    seconds = match['second'] or '00'
    return (
        f"{match['day']}/{match['month']}/{match['year']} "
        f"{match['hour']}:{match['minute']}:{seconds} "
        f"{match['meridiem'].upper()}"
    )


def preprocess(data: str) -> pd.DataFrame:
    """
    Preprocesses raw WhatsApp chat data into a structured pandas DataFrame.

    The text is cut at every 12-hour timestamp prefix of the form
    ``D/M/Y, H:MM[:SS] AM|PM - `` (case-insensitive, flexible whitespace).
    Dates are read as day/month/year. Text before the first timestamp is
    ignored. Progress and warnings are printed to stdout.

    Args:
        data: Full text of the exported chat.

    Returns:
        A DataFrame with one row per entry whose timestamp could be parsed,
        with columns ``date``, ``user``, ``message``, ``only_date``, ``year``,
        ``month_num``, ``month``, ``day``, ``day_name``, ``hour`` and
        ``minute``. Entries without a ``Sender: `` prefix on their first line
        get the user ``'group_notification'``. Rows whose timestamp cannot be
        parsed (e.g. an impossible calendar date) are dropped; the index is
        not reset afterwards.
    """
    print("Preprocess started")

    # One scan yields both the timestamps and the text between them, so the
    # two lists are the same length by construction.
    stamp_matches = list(_TIMESTAMP_RE.finditer(data))
    next_starts = [m.start() for m in stamp_matches[1:]] + [len(data)]
    bodies = [
        data[m.end():stop] for m, stop in zip(stamp_matches, next_starts)
    ]
    print(f"Found {len(bodies)} messages and {len(stamp_matches)} dates")

    # --- Date Parsing ---
    # Every spacing/seconds variant accepted by the regex is normalised to one
    # layout first, so a matched timestamp is never dropped merely because its
    # whitespace or optional seconds differ from a hard-coded format.
    stamps = pd.Series(
        [_canonical_stamp(m) for m in stamp_matches], dtype='object'
    )
    parsed = None
    for fmt in _DATE_FORMATS:
        converted = pd.to_datetime(stamps, format=fmt, errors='coerce')
        parsed = converted if parsed is None else parsed.fillna(converted)

    # --- Extract Users and Messages ---
    users = []
    texts = []
    for body in bodies:
        sender = _SENDER_RE.match(body)
        if sender:  # Standard message format: Sender: Message
            users.append(sender.group(1).strip())
            texts.append(body[sender.end():].strip())
        else:  # Group notification or metadata
            users.append('group_notification')
            texts.append(body.strip())

    df = pd.DataFrame({'date': parsed, 'user': users, 'message': texts})

    # --- Final Cleanup and Feature Creation ---
    if df['date'].isna().any():
        print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")
    df.dropna(subset=['date'], inplace=True)

    # Extract temporal features
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute
    return df