File size: 4,616 Bytes
f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d c1e18bd f05b66d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import re
import pandas as pd
import numpy as np
# Matches the start of a WhatsApp chat line and captures (date, time).
# Supports both 12-hour ("8:09 am", possibly with a narrow/no-break space
# before the meridiem) and 24-hour ("07:43") exports.
_LINE_PATTERN = re.compile(
    r'(\d{1,2}/\d{1,2}/\d{2,4}), '
    r'(\d{1,2}:\d{2}(?:[\s\u202f\u00a0]?(?:AM|PM|am|pm))?) - '
)

# Splits "Sender: message" — non-greedy sender up to the first ": ".
_SENDER_PATTERN = re.compile(r'([\w\W]+?):\s')

# Candidate timestamp formats. Two-digit-year formats come FIRST on purpose:
# '%Y' also matches a 2-digit year (e.g. '23' -> year 23), which the original
# order only survived because year 23 is outside datetime64[ns] bounds and
# errors='coerce' turned it into NaT. Trying '%y' first is always safe — on a
# 4-digit year the leftover digits break the literal ', ' and the parse fails.
_DATE_FORMATS = (
    '%d/%m/%y, %I:%M %p',  # 12-hour, 2-digit year
    '%d/%m/%y, %H:%M',     # 24-hour, 2-digit year
    '%d/%m/%Y, %I:%M %p',  # 12-hour, 4-digit year
    '%d/%m/%Y, %H:%M',     # 24-hour, 4-digit year
)


def _find_chat_start(lines):
    """Return the index of the first line that looks like a chat message.

    WhatsApp exports often begin with an end-to-end-encryption notice or
    other junk that must be skipped before parsing; falls back to 0 when
    no line matches.
    """
    for i, line in enumerate(lines):
        if _LINE_PATTERN.search(line):
            return i
    return 0


def _extract(data):
    """Return parallel lists (messages, timestamp_strings) from export text."""
    # re.split with the 2 capturing groups yields
    # [preamble, date, time, msg, date, time, msg, ...],
    # so the message bodies sit at indices 3, 6, 9, ...
    messages = _LINE_PATTERN.split(data)[3::3]
    stamps = []
    for date_part, time_part in _LINE_PATTERN.findall(data):
        # Normalize the narrow/no-break spaces some exports put before am/pm.
        combined = f"{date_part}, {time_part}"
        stamps.append(
            combined.replace('\u202f', ' ').replace('\u00a0', ' ').strip()
        )
    return messages, stamps


def _parse_timestamps(series):
    """Parse timestamp strings, trying each known format on still-unparsed rows.

    Returns a datetime64[ns] Series aligned to `series`; rows no format
    could parse stay NaT.
    """
    parsed = pd.Series(pd.NaT, index=series.index, dtype='datetime64[ns]')
    for fmt in _DATE_FORMATS:
        todo = parsed.isna()
        if not todo.any():
            break  # everything parsed — skip remaining formats
        parsed[todo] = pd.to_datetime(series[todo], format=fmt, errors='coerce')
    return parsed


def preprocess(data):
    """Parse a raw WhatsApp chat export into a tidy DataFrame.

    Parameters
    ----------
    data : str
        Full text of a WhatsApp ``.txt`` export.

    Returns
    -------
    pandas.DataFrame or None
        One row per user message with columns ``timestamp_string``, ``date``,
        ``user``, ``message`` plus calendar metadata (``only_date``, ``year``,
        ``month_num``, ``month``, ``day``, ``day_name``, ``hour``,
        ``minute``). Group notifications (lines without a "Sender: " prefix)
        are dropped. Returns ``None`` on parse failure so the caller (e.g. a
        Streamlit app) can handle it gracefully.
    """
    print("Preprocess started")

    # Skip any preamble (encryption notice, junk) before the first chat line.
    lines = data.split('\n')
    data = '\n'.join(lines[_find_chat_start(lines):])

    messages, dates = _extract(data)
    print(f"Found {len(messages)} messages and {len(dates)} dates")
    if not messages or len(messages) != len(dates):
        print(f"Error: Mismatched number of messages ({len(messages)}) and dates ({len(dates)}). Returning None.")
        return None

    df = pd.DataFrame({'user_message': messages, 'message_date': dates})
    df['date'] = _parse_timestamps(df['message_date'])

    # Drop rows no format could parse.
    df.dropna(subset=['date'], inplace=True)
    if df.empty:
        print("Error: DataFrame is empty after parsing dates. All date formats failed.")
        return None

    df.rename(columns={'message_date': 'timestamp_string'}, inplace=True)

    # Split "Sender: message" once per row (the original ran the regex four
    # times per row across its two lambdas). maxsplit is keyword-only in
    # spirit: passing it positionally to re.split is deprecated (3.13+).
    users, bodies = [], []
    for text in df['user_message']:
        parts = _SENDER_PATTERN.split(text, maxsplit=1)
        if len(parts) > 2:
            users.append(parts[1].strip())
            bodies.append(parts[2].strip())
        else:
            # No "Sender: " prefix — a system/group notification.
            users.append('group_notification')
            bodies.append(text.strip())
    df['user'] = users
    df['message'] = bodies

    df.drop(columns=['user_message'], inplace=True)
    df = df[df['user'] != 'group_notification'].copy()

    # Calendar metadata columns for downstream analysis.
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    print(f"Preprocess finished with {df.shape[0]} valid messages.")
    return df