Spaces:

SandhyaRaghav
/

whatsapp-chat-analyzer

Sleeping

App Files Files Community

whatsapp-chat-analyzer / preprocessor.py

SandhyaRaghav

Update preprocessor.py

c1e18bd verified 3 months ago

raw

history blame contribute delete

4.62 kB

	import re
	import pandas as pd
	import numpy as np

	# Timestamp prefix that starts every exported chat entry, e.g.
	# "19/11/2023, 07:43 - " or "01/01/2025, 8:09 AM - ".
	# Group 1 is the date, group 2 the time with an optional AM/PM suffix that may be
	# separated by a regular, narrow no-break (U+202F) or no-break (U+00A0) space.
	_TIMESTAMP_PATTERN = re.compile(
	    r'(\d{1,2}/\d{1,2}/\d{2,4}), '
	    r'(\d{1,2}:\d{2}(?:[\s\u202f\u00a0]?(?:AM|PM|am|pm))?) - '
	)

	# Trailing AM/PM marker together with any whitespace in front of it.
	_MERIDIEM_SUFFIX = re.compile(r'\s*([AaPp][Mm])$')

	# Lazily matches everything up to the first colon followed by whitespace.
	_SENDER_SPLIT = re.compile(r'([\w\W]+?):\s')

	# Formats tried in order on the normalised "date, time" strings (all day-first).
	_TIMESTAMP_FORMATS = (
	    '%d/%m/%Y, %I:%M %p',  # 12-hour clock, 4-digit year
	    '%d/%m/%Y, %H:%M',     # 24-hour clock, 4-digit year
	    '%d/%m/%y, %I:%M %p',  # 12-hour clock, 2-digit year
	    '%d/%m/%y, %H:%M',     # 24-hour clock, 2-digit year
	)


	def _normalise_timestamp(date_part, time_part):
	    """Join date and time into one string that the strptime formats can parse."""
	    stamp = f"{date_part}, {time_part}".replace('\u202f', ' ').replace('\u00a0', ' ').strip()
	    # '%I:%M %p' needs whitespace before the meridiem, so "8:09AM" becomes "8:09 AM".
	    return _MERIDIEM_SUFFIX.sub(r' \1', stamp)


	def _parse_timestamps(raw):
	    """Parse a Series of timestamp strings, trying each known format in turn."""
	    parsed = pd.to_datetime(raw, format=_TIMESTAMP_FORMATS[0], errors='coerce')
	    for format_str in _TIMESTAMP_FORMATS[1:]:
	        pending = parsed.isna()
	        if not pending.any():
	            break
	        # Only rows that every earlier format rejected are retried.
	        retry = pd.to_datetime(raw[pending], format=format_str, errors='coerce')
	        parsed = parsed.fillna(retry)
	    return parsed


	def _split_sender(entry):
	    """Split "sender: text" into (sender, text).

	    Entries without a colon followed by whitespace are attributed to the
	    pseudo-user 'group_notification'.
	    """
	    parts = _SENDER_SPLIT.split(entry, maxsplit=1)
	    if len(parts) > 2:
	        return parts[1].strip(), parts[2].strip()
	    return 'group_notification', entry.strip()


	def preprocess(data):
	    """Turn the text of an exported chat into a DataFrame of user messages.

	    Args:
	        data: Full text of the export as a single string. Entries are expected
	            to start with "D/M/Y, H:MM - " (24-hour) or "D/M/Y, H:MM AM - "
	            (12-hour); dates are interpreted day-first.

	    Returns:
	        A DataFrame with the columns timestamp_string, date, user, message,
	        only_date, year, month_num, month, day, day_name, hour and minute, with
	        'group_notification' rows removed. Returns None when no timestamped
	        entry is found or when no timestamp can be parsed.
	    """
	    print("Preprocess started")

	    # One split yields [leading_text, date, time, message, date, time, message, ...].
	    # Anything before the first timestamp (header junk) lands in element 0 and is ignored.
	    pieces = _TIMESTAMP_PATTERN.split(data)
	    messages = pieces[3::3]
	    dates = [
	        _normalise_timestamp(date_part, time_part)
	        for date_part, time_part in zip(pieces[1::3], pieces[2::3])
	    ]

	    print(f"Found {len(messages)} messages and {len(dates)} dates")

	    if not messages:
	        print("Error: No timestamped messages found in the input. Returning None.")
	        # Callers treat None as a parsing failure.
	        return None

	    df = pd.DataFrame({'user_message': messages, 'timestamp_string': dates})
	    df['date'] = _parse_timestamps(df['timestamp_string'])

	    # Drop rows where parsing failed with all formats (original index labels are kept).
	    df = df[df['date'].notna()].copy()
	    if df.empty:
	        print("Error: DataFrame is empty after parsing dates. All date formats failed.")
	        return None

	    # Split each entry once into sender and text.
	    sender_text = [_split_sender(entry) for entry in df['user_message']]
	    df['user'] = [sender for sender, _ in sender_text]
	    df['message'] = [text for _, text in sender_text]

	    # Remove the raw column and the entries not attributed to a user.
	    df = df.drop(columns=['user_message'])
	    df = df[df['user'] != 'group_notification'].copy()

	    # Calendar breakdown columns derived from the parsed timestamp.
	    df['only_date'] = df['date'].dt.date
	    df['year'] = df['date'].dt.year
	    df['month_num'] = df['date'].dt.month
	    df['month'] = df['date'].dt.month_name()
	    df['day'] = df['date'].dt.day
	    df['day_name'] = df['date'].dt.day_name()
	    df['hour'] = df['date'].dt.hour
	    df['minute'] = df['date'].dt.minute

	    print(f"Preprocess finished with {df.shape[0]} valid messages.")
	    return df