File size: 3,104 Bytes
3fc588c
 
 
 
fb999e2
 
 
 
3fc588c
fb999e2
 
3fc588c
fb999e2
 
3fc588c
fb999e2
 
3fc588c
fb999e2
3fc588c
fb999e2
 
3fc588c
 
fb999e2
 
 
6513ecd
fb999e2
 
 
 
 
 
 
3fc588c
fb999e2
 
 
3fc588c
 
fb999e2
 
 
3fc588c
 
fb999e2
3fc588c
fb999e2
 
 
 
3fc588c
 
 
 
fb999e2
3fc588c
fb999e2
 
3fc588c
fb999e2
3fc588c
 
fb999e2
3fc588c
fb999e2
 
 
 
 
 
3fc588c
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re
import pandas as pd

def preprocess(data):
    """Parse an exported WhatsApp chat into a structured pandas DataFrame.

    The text is cut at every message header of the form
    ``D/M/Y, H:MM[:SS] AM|PM - `` (day-first date, 12-hour clock, 2- or
    4-digit year, optional seconds, AM/PM in any case with any or no
    whitespace before it). The text between two headers is one entry.

    Args:
        data: Full text of the exported chat as a single string.

    Returns:
        A DataFrame with one row per entry whose timestamp could be parsed
        and the columns ``date``, ``user``, ``message``, ``only_date``,
        ``year``, ``month_num``, ``month``, ``day``, ``day_name``, ``hour``
        and ``minute``. Entries whose first line has no ``Sender: `` prefix
        get the user ``'group_notification'``. Rows whose timestamp cannot
        be parsed (e.g. an impossible calendar date) are dropped after a
        warning is printed.
    """
    print("Preprocess started")

    # Named groups let every header be rebuilt in one canonical layout below,
    # so each spacing/case/seconds variant accepted here is also parseable.
    # `\s` already covers the narrow and regular no-break spaces that some
    # exports place before AM/PM.
    header_re = re.compile(
        r'(?P<date>\d{1,2}/\d{1,2}/\d{2,4}),\s*'
        r'(?P<time>(?:1[0-2]|0?[1-9]):[0-5][0-9](?::[0-5][0-9])?)'
        r'\s*(?P<meridiem>AM|PM)\s*-\s*',
        flags=re.I,
    )
    headers = list(header_re.finditer(data))

    # Each entry runs from the end of its header to the start of the next
    # header (or to the end of the text for the last one).
    body_ends = [h.start() for h in headers[1:]] + [len(data)]
    raw_messages = [data[h.end():end] for h, end in zip(headers, body_ends)]

    print(f"Found {len(raw_messages)} messages and {len(headers)} dates")

    # --- Date parsing ---
    # Canonical layout: "<date>, H:MM:SS AM|PM" (seconds default to 00).
    stamps = []
    for h in headers:
        clock = h['time']
        if clock.count(':') == 1:
            clock += ':00'
        stamps.append(f"{h['date']}, {clock} {h['meridiem'].upper()}")

    stamp_series = pd.Series(stamps, dtype=object)
    # Explicit dtype keeps the `.dt` accessor usable even for empty input.
    parsed = pd.Series(pd.NaT, index=stamp_series.index, dtype='datetime64[ns]')
    # After canonicalisation only the year width differs between formats.
    for fmt in ('%d/%m/%y, %I:%M:%S %p', '%d/%m/%Y, %I:%M:%S %p'):
        converted = pd.to_datetime(stamp_series, format=fmt, errors='coerce')
        parsed = parsed.fillna(converted)

    # --- Extract users and messages ---
    # `.` does not match a newline, so the sender prefix must sit on the first
    # line of the entry; a ": " on a later line is ordinary message text.
    sender_re = re.compile(r'^(.+?):\s')
    users = []
    texts = []
    for raw in raw_messages:
        found = sender_re.match(raw)
        if found:  # Standard message format: "Sender: Message"
            users.append(found.group(1).strip())
            texts.append(raw[found.end():].strip())
        else:  # Group notification or other metadata without a sender
            users.append('group_notification')
            texts.append(raw.strip())

    df = pd.DataFrame({'date': parsed, 'user': users, 'message': texts})

    # --- Final cleanup and feature creation ---
    if df['date'].isna().any():
        print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")
    df.dropna(subset=['date'], inplace=True)

    # Temporal features derived from the parsed timestamp.
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    return df