SandhyaRaghav committed on
Commit
6513ecd
·
verified ·
1 Parent(s): 3fc588c

Update src/preprocessor.py

Browse files
Files changed (1) hide show
  1. src/preprocessor.py +90 -45
src/preprocessor.py CHANGED
@@ -2,74 +2,107 @@ import re
2
  import pandas as pd
3
 
4
  def preprocess(data):
5
- """
6
- Preprocesses raw WhatsApp chat data into a structured pandas DataFrame.
7
- """
8
- print("Preprocess started")
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- # Robust regex to capture the timestamp pattern including optional seconds and AM/PM variations.
11
- pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s*(?:1[0-2]|0?[1-9]):[0-5][0-9](?::[0-5][0-9])?\s*[\s\u202f\u00a0]?(?:AM|PM)\s*-\s*'
12
 
13
- # Split the data by the timestamp pattern (case-insensitive flag re.I).
14
- messages = re.split(pattern, data, flags=re.I)[1:]
15
 
16
- # Find all occurrences of the timestamp pattern.
17
- date = re.findall(pattern, data, flags=re.I)
18
 
19
- print(f"Found {len(messages)} messages and {len(date)} dates")
 
 
20
 
21
- if len(messages) != len(date):
22
- print("Error: The number of messages and dates do not match.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  return None
24
 
25
- # Clean up dates before processing by removing non-breaking spaces
26
- dates = [d.replace('\u202f', ' ').replace('\u00a0', ' ') for d in date]
27
- df = pd.DataFrame({'user_message': messages, 'message_date': dates})
28
-
29
- # --- Date Parsing: Try common formats ---
30
- known_formats = [
31
- '%d/%m/%y, %I:%M %p - ', # Format without seconds (e.g., 01/01/25, 8:09 am)
32
- '%d/%m/%Y, %I:%M %p - ', # Format without seconds (4-digit year)
33
- '%d/%m/%Y, %I:%M:%S %p - ' # Format with seconds (4-digit year)
34
- ]
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  df['date'] = pd.NaT
37
- for fmt in known_formats:
38
- converted = pd.to_datetime(df['message_date'], format=fmt, errors='coerce')
39
- # Fill existing NaT values with successful conversions
40
  df['date'] = df['date'].fillna(converted)
41
 
42
- if df['date'].isna().any():
43
- print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")
44
-
45
  df.rename(columns={'date': 'date'}, inplace=True)
 
46
 
47
- # --- Extract Users and Messages ---
48
  users = []
49
- messages_list = []
50
-
51
- # Regex to capture sender name non-greedily up to the colon separator.
52
- user_pattern = r'^([\w\W]+?):\s'
53
 
54
  for message in df['user_message']:
 
55
  entry = re.split(user_pattern, message, maxsplit=1)
56
 
57
- if len(entry) > 1: # Standard message format: Sender: Message
58
  users.append(entry[1].strip())
59
- messages_list.append(entry[2].strip())
60
- else: # Group notification or metadata
61
  users.append('group_notification')
62
- messages_list.append(entry[0].strip())
63
 
64
  df['user'] = users
65
- df['message'] = messages_list
 
66
 
67
- # --- Final Cleanup and Feature Creation ---
68
- df.drop(columns=['user_message', 'message_date'], inplace=True)
69
-
70
- df.dropna(subset=['date'], inplace=True) # Drop rows where date parsing failed
71
-
72
- # Extract temporal features
73
  df['only_date'] = df['date'].dt.date
74
  df['year'] = df['date'].dt.year
75
  df['month_num'] = df['date'].dt.month
@@ -79,4 +112,16 @@ def preprocess(data):
79
  df['hour'] = df['date'].dt.hour
80
  df['minute'] = df['date'].dt.minute
81
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  return df
 
2
  import pandas as pd
3
 
4
  def preprocess(data):
5
+ # --- 1. Define Flexible Regex Pattern ---
6
+ # The pattern is made flexible to accommodate:
7
+ # 1. 2-digit or 4-digit year (\d{2,4}).
8
+ # 2. Optional seconds (?::\d{2})?.
9
+ # 3. Optional AM/PM indicator with flexible spacing/non-breaking spaces (\s*[\s\u202f\u00a0]?(?:AM|PM|am|pm))?.
10
+ # 4. The initial time component must match both 24H (00-23) and 12H (01-12) formats.
11
+ # We use non-capturing groups (?:...) where applicable.
12
+
13
+ # This pattern captures the date and the time as two separate groups. The AM/PM marker is only matched by a
14
+ # non-capturing group, so neither captured group contains it.
15
+ pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}),\s*(\d{1,2}:\d{2}(?::\d{2})?)\s*[\s\u202f\u00a0]?(?:AM|PM|am|pm)?\s*-\s*'
16
+
17
+ # --- 2. Split Messages and Extract Date Strings ---
18
+ # re.split uses capturing groups from the pattern, leading to the structure: [noise, date_part1, date_part2, ..., message, date_part1, ...].
19
+ # We use list comprehension to reconstruct the raw timestamp string and extract the message content.
20
 
21
+ # Find all matches of the full raw date string for later alignment.
22
+ # The pattern is complex, so we will use re.split/re.findall to extract the message boundaries.
23
 
24
+ # Re-running findall, but being extra permissive to capture the whole raw date string for later parsing
25
+ date_pattern_raw = r'(\d{1,2}/\d{1,2}/\d{2,4}),\s*(\d{1,2}:\d{2}(?::\d{2})?)\s*[\s\u202f\u00a0]?(?:AM|PM|am|pm)?\s*-\s*'
26
 
27
+ dates = re.findall(date_pattern_raw, data, flags=re.I)
 
28
 
29
+ # We now split the data to get the message content parts
30
+ messages = re.split(date_pattern_raw, data, flags=re.I)
31
+ messages = [m.strip() for m in messages if m.strip()]
32
 
33
+ # After splitting, the list contains interleaved date groups (2 groups: date and time) and message content.
34
+ # Reconstruct the raw date strings from the groups that were successfully found.
35
+ # The raw string is composed of the Date and Time groups; the optional AM/PM marker is matched by a non-capturing group and is not returned.
36
+ raw_dates = []
37
+ # Index 0 is the leading noise. Date groups start at index 1 and repeat every 3 indices.
38
+ # The structure is [Noise, D1, T1, M1, D2, T2, M2, ...]
39
+
40
+ # The simple split approach used before is very fragile with capturing groups.
41
+ # Let's revert to a non-capturing split, then reconstruct the dates to keep it simple.
42
+
43
+ # Use a non-capturing group for the split to get clean message blocks.
44
+ message_blocks = re.split(r'(?:\d{1,2}/\d{1,2}/\d{2,4}),\s*\d{1,2}:\d{2}(?::\d{2})?\s*[\s\u202f\u00a0]?(?:AM|PM|am|pm)?\s*-\s*', data, flags=re.I)[1:]
45
+
46
+ # Re-find the full date strings to ensure alignment.
47
+ raw_date_strings = re.findall(date_pattern_raw, data, flags=re.I)
48
+
49
+ if len(message_blocks) != len(raw_date_strings):
50
+ # This occurs if non-date text exists before the first date in the file,
51
+ # which is handled by removing the first split element, or due to a bad pattern match.
52
+ print("Error: Message and date counts are mismatched. Check the first line of the chat file.")
53
  return None
54
 
55
+ # Clean up dates by joining and removing non-breaking spaces
56
+ dates_for_df = [" ".join(d).strip().replace('\u202f', ' ').replace('\u00a0', ' ') for d in raw_date_strings]
 
 
 
 
 
 
 
 
57
 
58
+ df = pd.DataFrame({'user_message': message_blocks, 'message_date': dates_for_df})
59
+
60
+ # --- 3. Robust Date Parsing ---
61
+ # We define all possible expected formats (24H, 12H, with/without seconds, 2/4-digit year)
62
+ DATE_FORMATS = [
63
+ # 12-Hour Formats (with AM/PM)
64
+ '%d/%m/%y, %I:%M %p - ',
65
+ '%d/%m/%Y, %I:%M %p - ',
66
+ '%d/%m/%y, %I:%M:%S %p - ',
67
+ '%d/%m/%Y, %I:%M:%S %p - ',
68
+
69
+ # 24-Hour Formats (without AM/PM)
70
+ '%d/%m/%y, %H:%M - ',
71
+ '%d/%m/%Y, %H:%M - ',
72
+ '%d/%m/%y, %H:%M:%S - ',
73
+ '%d/%m/%Y, %H:%M:%S - '
74
+ ]
75
+
76
  df['date'] = pd.NaT
77
+ # Iterate through all known formats, filling successful conversions and coercing failures to NaT
78
+ for fmt in DATE_FORMATS:
79
+ converted = pd.to_datetime(df['message_date'], format=fmt.strip(), errors='coerce')
80
  df['date'] = df['date'].fillna(converted)
81
 
 
 
 
82
  df.rename(columns={'date': 'date'}, inplace=True)
83
+ df.dropna(subset=['date'], inplace=True) # Drop rows where date parsing failed
84
 
85
+ # --- 4. Extract Users and Messages ---
86
  users = []
87
+ messages = []
88
+ user_pattern = r'^([\w\W]+?):\s'
 
 
89
 
90
  for message in df['user_message']:
91
+ # Split message into [Noise, User, Message Content]
92
  entry = re.split(user_pattern, message, maxsplit=1)
93
 
94
+ if len(entry) > 1:
95
  users.append(entry[1].strip())
96
+ messages.append(entry[2].strip())
97
+ else:
98
  users.append('group_notification')
99
+ messages.append(entry[0].strip())
100
 
101
  df['user'] = users
102
+ df['message'] = messages
103
+ df.drop(columns=['user_message'], inplace=True)
104
 
105
+ # --- 5. Extract Temporal Features ---
 
 
 
 
 
106
  df['only_date'] = df['date'].dt.date
107
  df['year'] = df['date'].dt.year
108
  df['month_num'] = df['date'].dt.month
 
112
  df['hour'] = df['date'].dt.hour
113
  df['minute'] = df['date'].dt.minute
114
 
115
+ # --- 6. Calculate Message Period ---
116
+ period = []
117
+ for hour in df['hour']:
118
+ if hour == 23:
119
+ period.append(str(hour) + "-" + str('00'))
120
+ elif hour == 0:
121
+ period.append(str('00') + "-" + str(hour + 1))
122
+ else:
123
+ period.append(str(hour) + "-" + str(hour + 1))
124
+
125
+ df['period'] = period
126
+
127
  return df