|
|
import re |
|
|
import pandas as pd |
|
|
|
|
|
def preprocess(data: str) -> "pd.DataFrame | None":
    """Parse a raw WhatsApp chat export into a structured pandas DataFrame.

    The export is split on its 12-hour timestamp prefixes
    (``d/m/y, h:mm[:ss] AM|PM - ``). Each chunk following a prefix is one
    message, which is further split into sender and text.

    Args:
        data: Full text of the exported chat.

    Returns:
        A DataFrame with the columns ``date``, ``user``, ``message``,
        ``only_date``, ``year``, ``month_num``, ``month``, ``day``,
        ``day_name``, ``hour`` and ``minute``. Chunks without a
        ``sender: `` prefix get the user ``'group_notification'``. Rows whose
        timestamp cannot be parsed day-first are dropped (the surviving rows
        keep their original index labels). ``None`` is returned if the number
        of timestamps and message chunks ever disagree.
    """
    print("Preprocess started")

    # Timestamp prefix of every message. WhatsApp may put a narrow or regular
    # no-break space before AM/PM, hence the explicit \u202f / \u00a0.
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s*(?:1[0-2]|0?[1-9]):[0-5][0-9](?::[0-5][0-9])?\s*[\s\u202f\u00a0]?(?:AM|PM)\s*-\s*'
    stamp_re = re.compile(pattern, flags=re.I)

    # Everything before the first timestamp is not a message, so skip it.
    messages = stamp_re.split(data)[1:]
    date = stamp_re.findall(data)

    print(f"Found {len(messages)} messages and {len(date)} dates")

    # Defensive check: the pattern has no capturing groups, so the two lists
    # should always line up. Kept so callers that test for None still work.
    if len(messages) != len(date):
        print("Error: The number of messages and dates do not match.")
        return None

    # Replace the special spaces with plain ones before parsing.
    dates = [d.replace('\u202f', ' ').replace('\u00a0', ' ') for d in date]
    stamps = pd.Series(dates, dtype='object')

    # Every year-width / seconds combination the timestamp regex can match.
    # The two-digit-year-with-seconds variant used to be missing, which made
    # such messages fail parsing and get dropped.
    known_formats = (
        '%d/%m/%y, %I:%M %p - ',
        '%d/%m/%y, %I:%M:%S %p - ',
        '%d/%m/%Y, %I:%M %p - ',
        '%d/%m/%Y, %I:%M:%S %p - ',
    )

    # Try each format on the whole column; a row keeps the first successful
    # parse and stays NaT only if no format fits.
    parsed = None
    for fmt in known_formats:
        converted = pd.to_datetime(stamps, format=fmt, errors='coerce')
        parsed = converted if parsed is None else parsed.fillna(converted)

    if parsed.isna().any():
        print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")

    # A leading "sender: " marks a user message; anything else is treated as
    # a group/system notification.
    sender_re = re.compile(r'^([\w\W]+?):\s')
    users = []
    messages_list = []
    for chunk in messages:
        found = sender_re.match(chunk)
        if found:
            users.append(found.group(1).strip())
            messages_list.append(chunk[found.end():].strip())
        else:
            users.append('group_notification')
            messages_list.append(chunk.strip())

    df = pd.DataFrame({'date': parsed, 'user': users, 'message': messages_list})
    df.dropna(subset=['date'], inplace=True)

    # Calendar/time breakdown columns derived from the parsed timestamp.
    when = df['date'].dt
    df['only_date'] = when.date
    df['year'] = when.year
    df['month_num'] = when.month
    df['month'] = when.month_name()
    df['day'] = when.day
    df['day_name'] = when.day_name()
    df['hour'] = when.hour
    df['minute'] = when.minute

    return df