SandhyaRaghav committed on
Commit
3fc588c
·
verified ·
1 Parent(s): c1e18bd

Update src/preprocessor.py

Browse files
Files changed (1) hide show
  1. src/preprocessor.py +82 -48
src/preprocessor.py CHANGED
@@ -1,48 +1,82 @@
1
- import re
2
- import pandas as pd
3
-
4
- def preprocess(data):
5
- print("Preprocess started")
6
-
7
- pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s(?:1[0-2]|0?[1-9]):[0-5][0-9][\s\u202f\u00a0]?(?:AM|PM|am|pm)\s-\s'
8
- messages = re.split(pattern, data)[1:]
9
- date = re.findall(pattern, data)
10
- print(f"Found {len(messages)} messages and {len(date)} dates")
11
-
12
- dates = [d.replace('\u202f', ' ').replace('\u00a0', ' ') for d in date]
13
- df = pd.DataFrame({'user_message': messages, 'message_date': dates})
14
-
15
- try:
16
- df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%y, %I:%M %p - ')
17
- except Exception as e:
18
- print("Date parsing error:", e)
19
- return None
20
-
21
- df.rename(columns={'message_date': 'date'}, inplace=True)
22
-
23
- users = []
24
- messages_list = []
25
- for message in df['user_message']:
26
- entry = re.split(r'([\w\W]+?):\s', message)
27
- if entry[1:]: # user exists
28
- users.append(entry[1])
29
- messages_list.append(" ".join(entry[2:]))
30
- else:
31
- users.append('group_notification')
32
- messages_list.append(entry[0])
33
-
34
- df['user'] = users
35
- df['message'] = messages_list
36
-
37
- df.drop(columns=['user_message'], inplace=True)
38
- df['only_date'] = df['date'].dt.date
39
- df['year'] = df['date'].dt.year
40
- df['month_num'] = df['date'].dt.month
41
- df['month'] = df['date'].dt.month_name()
42
- df['day'] = df['date'].dt.day
43
- df['day_name'] = df['date'].dt.day_name()
44
- df['hour'] = df['date'].dt.hour
45
- df['minute'] = df['date'].dt.minute
46
-
47
- return df
48
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import pandas as pd
3
+
4
# Header that opens every message in an exported WhatsApp chat, e.g.
# "01/01/25, 8:09 am - " or "01/01/2025, 8:09:30 PM - ".  In Python 3 str
# patterns, \s already matches U+00A0 and U+202F, which some exports place
# before AM/PM, so no extra character class is needed.
_TIMESTAMP_RE = re.compile(
    r'(?P<day>\d{1,2})/(?P<month>\d{1,2})/(?P<year>\d{2,4}),\s*'
    r'(?P<hour>1[0-2]|0?[1-9]):(?P<minute>[0-5][0-9])'
    r'(?::(?P<second>[0-5][0-9]))?'
    r'\s*(?P<meridiem>AM|PM)\s*-\s*',
    flags=re.IGNORECASE,
)

# Sender prefix: everything (non-greedy) up to the first colon that is
# followed by whitespace.
_SENDER_RE = re.compile(r'^([\w\W]+?):\s')

# Canonical layouts produced by _canonical_timestamp: two- and four-digit year.
_CANONICAL_DATE_FORMATS = (
    '%d/%m/%y %I:%M:%S %p',
    '%d/%m/%Y %I:%M:%S %p',
)


def _canonical_timestamp(match):
    """Rebuild a matched header as 'D/M/Y H:MM:SS AM|PM' with fixed spacing."""
    parts = match.groupdict()
    return '{day}/{month}/{year} {hour}:{minute}:{second} {meridiem}'.format(
        day=parts['day'],
        month=parts['month'],
        year=parts['year'],
        hour=parts['hour'],
        minute=parts['minute'],
        second=parts['second'] or '00',  # seconds are optional in exports
        meridiem=parts['meridiem'].upper(),
    )


def preprocess(data: str) -> pd.DataFrame:
    """
    Preprocess raw WhatsApp chat text into a structured pandas DataFrame.

    The text is cut at every timestamp header (day-first date, 12-hour clock,
    optional seconds, AM/PM in any case).  Each piece becomes one row with the
    sender, the message text, the parsed timestamp and derived calendar
    columns.  Pieces without a ``Sender: `` prefix are attributed to
    ``'group_notification'``.

    Args:
        data: Full text of an exported chat.

    Returns:
        DataFrame with columns ``date``, ``user``, ``message``, ``only_date``,
        ``year``, ``month_num``, ``month``, ``day``, ``day_name``, ``hour`` and
        ``minute``.  Rows whose header is not a valid calendar date are
        dropped and the index is renumbered from 0.
    """
    print("Preprocess started")

    # One pass over the text: each header match ends where its message body
    # begins, and the body runs up to the next header (or end of input).
    headers = list(_TIMESTAMP_RE.finditer(data))
    body_ends = [m.start() for m in headers[1:]] + [len(data)]
    bodies = [data[m.end():end] for m, end in zip(headers, body_ends)]

    print(f"Found {len(bodies)} messages and {len(headers)} dates")

    # --- Date Parsing ---
    # Headers are normalised first, so spacing variants and optional seconds
    # that the regex accepts can no longer fall through the format list.
    stamps = pd.Series([_canonical_timestamp(m) for m in headers], dtype=object)
    parsed = pd.to_datetime(stamps, format=_CANONICAL_DATE_FORMATS[0], errors='coerce')
    for fmt in _CANONICAL_DATE_FORMATS[1:]:
        # Only rows still unparsed (NaT) are filled by the next format.
        parsed = parsed.fillna(pd.to_datetime(stamps, format=fmt, errors='coerce'))

    if parsed.isna().any():
        print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")

    # --- Extract Users and Messages ---
    users = []
    texts = []
    for body in bodies:
        sender = _SENDER_RE.match(body)
        if sender:  # Standard message format: Sender: Message
            users.append(sender.group(1).strip())
            texts.append(body[sender.end():].strip())
        else:  # Group notification or metadata
            users.append('group_notification')
            texts.append(body.strip())

    # --- Final Cleanup and Feature Creation ---
    df = pd.DataFrame({'date': parsed, 'user': users, 'message': texts})
    # Drop rows where date parsing failed and keep the index contiguous.
    df = df.dropna(subset=['date']).reset_index(drop=True)

    # Extract temporal features
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    return df