# SURESHBEEKHANI's picture
# Upload 11 files
# 7089d06 verified
import re
import pandas as pd
def _format_period(hour, ampm):
    """Return the one-hour slot label for a 12-hour clock reading.

    Args:
        hour (int): Hour on the 12-hour clock.
        ampm (str): Meridiem marker, 'am' or 'pm'.

    Returns:
        str: A label such as '11 am - 12 pm'.
    """
    # `% 12` wraps 12 -> 1; `or 12` turns the modulo result 0 (hour 11) into 12.
    next_hour = (hour + 1) % 12 or 12
    # The meridiem only flips when the slot runs from 11 to 12.
    next_ampm = 'pm' if hour == 11 and ampm == 'am' else \
        'am' if hour == 11 and ampm == 'pm' else ampm
    return f"{hour} {ampm} - {next_hour} {next_ampm}"


def parse_whatsapp_chat(data):
    """
    Parses WhatsApp chat data and converts it into a structured pandas DataFrame.

    A line is recognised when it looks like
    ``dd/mm/yyyy, h:mm am - User: message`` with a lowercase ``am``/``pm``
    marker and a user name made of word characters, whitespace and ``+``.
    Text that does not match this shape is skipped, and only the first line of
    a message is captured because ``.`` does not match newlines.

    Args:
        data (str): The raw WhatsApp chat log as a string.

    Returns:
        pd.DataFrame: One row per matched message with the columns
        'Date' (datetime), 'Time', 'User', 'Message', 'only_date', 'Year',
        'Month', 'month_num', 'day_name', 'Hour_Minute', 'AMPM',
        'Hour' (int, 12-hour clock), 'Minute' (str) and 'period'.
        When nothing matches, an empty DataFrame with the same columns is
        returned instead of raising.
    """
    # Define regex pattern to extract date, time, user, and message
    pattern = r"(\d{2}/\d{2}/\d{4}),\s(\d{1,2}:\d{2}\s?[ap]m)\s-\s([\w\s+]+):\s(.*)"

    # Extract matches using the regex pattern
    matches = re.findall(pattern, data)

    # Create a DataFrame from matches
    df = pd.DataFrame(matches, columns=['Date', 'Time', 'User', 'Message'])

    # Convert 'Date' column to datetime and derive the calendar components
    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
    df['only_date'] = df['Date'].dt.date
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.strftime('%B')
    df['month_num'] = df['Date'].dt.month
    df['day_name'] = df['Date'].dt.strftime('%A')

    # Split 'Time' into its parts with a single extract. Unlike
    # str.split(expand=True), extract always yields one column per regex
    # group, so this also works when there are no rows at all.
    time_parts = df['Time'].str.extract(
        r'((\d{1,2}):(\d{2}))\s?(am|pm)', expand=True
    )
    df['Hour_Minute'] = time_parts[0]
    df['AMPM'] = time_parts[3]
    df['Hour'] = time_parts[1].astype(int)
    df['Minute'] = time_parts[2]

    # Add time periods in "H am/pm - H am/pm" format. A list comprehension is
    # used instead of a row-wise apply so that an empty frame yields an empty
    # column rather than an empty DataFrame.
    df['period'] = [
        _format_period(hour, ampm)
        for hour, ampm in zip(df['Hour'].tolist(), df['AMPM'].tolist())
    ]
    return df