File size: 4,616 Bytes
f05b66d
 
 
 
 
 
 
c1e18bd
 
 
f05b66d
c1e18bd
 
f05b66d
 
 
c1e18bd
 
 
 
 
 
f05b66d
c1e18bd
 
 
 
f05b66d
c1e18bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f05b66d
 
c1e18bd
f05b66d
 
 
 
c1e18bd
f05b66d
c1e18bd
 
 
 
 
 
 
 
 
 
 
 
 
 
f05b66d
c1e18bd
 
f05b66d
c1e18bd
 
 
 
 
 
 
 
 
f05b66d
c1e18bd
f05b66d
 
c1e18bd
f05b66d
 
c1e18bd
 
 
 
 
 
 
 
 
f05b66d
 
 
 
 
 
 
 
 
c1e18bd
f05b66d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import re
import pandas as pd
import numpy as np

# Header of one exported WhatsApp message line. Captures:
#   group 1 -> date  (d/m/yy or d/m/yyyy)
#   group 2 -> time  (H:MM 24-hour, or H:MM AM/PM 12-hour; some exports put a
#                     narrow (\u202f) or non-breaking (\u00a0) space before the meridiem)
_HEADER_RE = re.compile(
    r'(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}(?:[\s\u202f\u00a0]?(?:AM|PM|am|pm))?) - '
)

# "user: message" prefix inside a single message body. Compiled once instead of
# being re-looked-up on every row, and applied once per row instead of up to
# four times (the original ran it twice inside each of two lambdas).
_USER_RE = re.compile(r'([\w\W]+?):\s')

# Date formats to try, most common first. %Y requires exactly four digits and
# %y exactly two under strptime rules, so the 4- and 2-digit-year variants
# never mis-parse each other's input.
_DATE_FORMATS = (
    '%d/%m/%Y, %I:%M %p',  # 12-hour, 4-digit year  (e.g. 01/01/2025, 8:09 AM)
    '%d/%m/%Y, %H:%M',     # 24-hour, 4-digit year  (e.g. 19/11/2023, 07:43)
    '%d/%m/%y, %I:%M %p',  # 12-hour, 2-digit year
    '%d/%m/%y, %H:%M',     # 24-hour, 2-digit year
)


def _split_user(raw_message):
    """Split one raw message body into a ``(user, text)`` pair.

    A body without a leading ``user: `` prefix (system/group notification)
    yields ``('group_notification', <whole body stripped>)``.
    """
    # maxsplit passed by keyword: positional maxsplit is deprecated (3.13+).
    parts = _USER_RE.split(raw_message, maxsplit=1)
    if len(parts) > 2:
        return parts[1].strip(), parts[2].strip()
    return 'group_notification', raw_message.strip()


def _parse_dates(df):
    """Fill a datetime64 ``date`` column from ``df['message_date']``.

    Each format in ``_DATE_FORMATS`` is tried in turn on the rows that are
    still unparsed; rows no format matches remain NaT.
    """
    df['date'] = pd.NaT
    for fmt in _DATE_FORMATS:
        unparsed = df['date'].isna()
        if not unparsed.any():
            break  # everything parsed — no point trying further formats
        df.loc[unparsed, 'date'] = pd.to_datetime(
            df.loc[unparsed, 'message_date'],
            format=fmt,
            errors='coerce',
        )
    # NOTE(review): .loc-assignment into a NaT-initialized column can leave an
    # object dtype on some pandas versions; force datetime64 so the .dt
    # accessor below is always valid.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    return df


def preprocess(data):
    """Parse a raw WhatsApp chat export into a tidy DataFrame.

    Parameters
    ----------
    data : str
        Full text of a WhatsApp ``.txt`` export (12- or 24-hour timestamps,
        2- or 4-digit years).

    Returns
    -------
    pandas.DataFrame or None
        One row per user message with columns ``timestamp_string``, ``date``,
        ``user``, ``message`` and calendar metadata (``only_date``, ``year``,
        ``month_num``, ``month``, ``day``, ``day_name``, ``hour``,
        ``minute``). Group notifications are dropped. Returns ``None`` when
        parsing fails, so the Streamlit caller can degrade gracefully.
    """
    print("Preprocess started")

    # --- STEP 1: Skip any leading junk (e.g. the end-to-end-encryption
    # banner) up to the first line that looks like a real message header.
    lines = data.split('\n')
    start_index = 0
    for i, line in enumerate(lines):
        if _HEADER_RE.search(line):
            start_index = i
            break
    data = '\n'.join(lines[start_index:])

    # --- STEP 2: Split messages and timestamps.
    # Splitting on the 2-capturing-group header yields
    # [prefix, date, time, body, date, time, body, ...] -> bodies at 3, 6, 9…
    messages = _HEADER_RE.split(data)[3::3]

    dates = []
    for date_part, time_part in _HEADER_RE.findall(data):
        # Normalize the narrow/non-breaking spaces some exports place before
        # AM/PM so strptime's literal ' ' in the formats matches.
        combined = (
            f"{date_part}, {time_part}"
            .replace('\u202f', ' ')
            .replace('\u00a0', ' ')
            .strip()
        )
        dates.append(combined)

    print(f"Found {len(messages)} messages and {len(dates)} dates")

    if len(messages) != len(dates) or not messages:
        print(f"Error: Mismatched number of messages ({len(messages)}) and dates ({len(dates)}). Returning None.")
        # Returning None ensures Streamlit handles the parsing failure gracefully.
        return None

    df = pd.DataFrame({'user_message': messages, 'message_date': dates})

    # --- STEP 3: Robust date parsing (12h/24h, 2/4-digit years).
    df = _parse_dates(df)

    # Drop rows where parsing failed with all formats.
    df.dropna(subset=['date'], inplace=True)
    if df.empty:
        print("Error: DataFrame is empty after parsing dates. All date formats failed.")
        return None

    df.rename(columns={'message_date': 'timestamp_string'}, inplace=True)

    # Split each raw message exactly once into its user/text parts.
    # df is guaranteed non-empty here, so the zip unpacking is safe.
    users, texts = zip(*(_split_user(m) for m in df['user_message']))
    df['user'] = users
    df['message'] = texts

    # Clean up group notifications and the raw column.
    df.drop(columns=['user_message'], inplace=True)
    df = df[df['user'] != 'group_notification'].copy()

    # --- STEP 4: Calendar metadata columns for downstream aggregation.
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    print(f"Preprocess finished with {df.shape[0]} valid messages.")
    return df