SandhyaRaghav committed on
Commit
c1e18bd
·
verified ·
1 Parent(s): fa23c4d

Update preprocessor.py

Browse files
Files changed (1) hide show
  1. preprocessor.py +70 -62
preprocessor.py CHANGED
@@ -5,88 +5,95 @@ import numpy as np
5
  def preprocess(data):
6
  print("Preprocess started")
7
 
8
- # This regex pattern captures the date/time string, allowing for 2 or 4 digit years
9
- # and accommodating various Unicode spaces/separators ([\s\u202f\u00a0]?)
10
- pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s(?:1[0-2]|0?[1-9]):[0-5][0-9][\s\u202f\u00a0]?(?:AM|PM|am|pm)\s-\s'
11
 
12
- # --- STEP 1: Clean/Prepare Data ---
 
13
  data_lines = data.split('\n')
14
  cleaned_lines = []
15
- first_message_found = False
16
-
17
- for line in data_lines:
18
- if re.match(pattern, line):
19
- cleaned_lines.append(line)
20
- first_message_found = True
21
- elif not first_message_found and line and 'end-to-end encrypted' in line:
22
- continue
23
- elif line and first_message_found:
24
- # Append multi-line messages
25
- cleaned_lines.append(line)
26
- elif line and not first_message_found:
27
- # Skip other junk lines before the first message
28
- continue
29
-
30
- data = '\n'.join(cleaned_lines)
31
 
32
- # --- STEP 2: Split Messages and Dates ---
33
- messages = re.split(pattern, data)[1:]
34
- date = re.findall(pattern, data)
 
 
 
35
 
36
- print(f"Found {len(messages)} messages and {len(date)} dates")
37
-
38
- dates = [d.replace('\u202f', ' ').replace('\u00a0', ' ').strip() for d in date]
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  if len(messages) != len(dates) or len(messages) == 0:
41
  print(f"Error: Mismatched number of messages ({len(messages)}) and dates ({len(dates)}). Returning None.")
 
42
  return None
43
 
44
  df = pd.DataFrame({'user_message': messages, 'message_date': dates})
45
 
46
- # --- STEP 3: Robust Date Parsing (Trying two common year formats) ---
47
- # Attempt 1: 4-digit year format (Standard for newer exports)
48
- format_4_digit_year = '%d/%m/%Y, %I:%M %p - '
49
 
50
- # Attempt 2: 2-digit year format (Used by some older exports)
51
- format_2_digit_year = '%d/%m/%y, %I:%M %p - '
52
-
53
- # We rely on pandas 'errors="coerce"' to tell us if the first format worked.
54
- df['date'] = pd.to_datetime(df['message_date'], format=format_4_digit_year, errors='coerce')
 
 
 
 
 
 
 
 
 
55
 
56
- # Find which dates failed to parse (NaT = Not a Time)
57
- unparsed_dates = df['date'].isna()
58
 
59
- # If any dates failed, try parsing them with the 2-digit year format
60
- if unparsed_dates.any():
61
- df.loc[unparsed_dates, 'date'] = pd.to_datetime(
62
- df.loc[unparsed_dates, 'message_date'],
63
- format=format_2_digit_year,
64
- errors='coerce'
65
- )
 
 
66
 
67
- # Drop rows where neither format worked
68
  df.dropna(subset=['date'], inplace=True)
69
  if df.empty:
70
- print("Error: DataFrame is empty after parsing dates. Neither 4-digit nor 2-digit year formats worked.")
71
  return None
72
 
73
- # --- STEP 4: Split User and Message Content ---
74
- users = []
75
- messages_list = []
76
- for message in df['user_message']:
77
- entry = re.split(r'([\w\W]+?):\s', message, 1)
78
- if len(entry) > 2:
79
- users.append(entry[1].strip())
80
- messages_list.append(entry[2].strip())
81
- else:
82
- users.append('group_notification')
83
- messages_list.append(entry[0].strip())
84
-
85
- df['user'] = users
86
- df['message'] = messages_list
87
-
88
- # --- STEP 5: Add Metadata Columns ---
89
- df.drop(columns=['user_message', 'message_date'], inplace=True)
90
  df['only_date'] = df['date'].dt.date
91
  df['year'] = df['date'].dt.year
92
  df['month_num'] = df['date'].dt.month
@@ -96,4 +103,5 @@ def preprocess(data):
96
  df['hour'] = df['date'].dt.hour
97
  df['minute'] = df['date'].dt.minute
98
 
 
99
  return df
 
5
  def preprocess(data):
6
  print("Preprocess started")
7
 
8
+ # NEW ROBUST REGEX PATTERN: Supports both 12-hour (H:MM AM/PM) and 24-hour (HH:MM) formats.
9
+ # It captures: Day/Month/Year, Space, Time (H:MM or HH:MM), optional AM/PM/unicode space, dash, space.
10
+ pattern = r'(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}(?:[\s\u202f\u00a0]?(?:AM|PM|am|pm))?) - '
11
 
12
+ # --- STEP 1: Separate metadata lines ---
13
+ # WhatsApp exports often have an initial line about end-to-end encryption.
14
  data_lines = data.split('\n')
15
  cleaned_lines = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ # We strip out the encryption header line or any preceding junk
18
+ start_index = 0
19
+ for i, line in enumerate(data_lines):
20
+ if re.search(pattern, line):
21
+ start_index = i
22
+ break
23
 
24
+ # Join the message content back starting from the first actual chat line
25
+ data = '\n'.join(data_lines[start_index:])
26
+
27
+ # --- STEP 2: Split Messages and Dates (using the capturing groups in the pattern) ---
28
 
29
+ # Extract messages: split the entire data string by the pattern
30
+ messages = re.split(pattern, data)[3::3] # Take every 3rd element starting from the 3rd index (the message content)
31
+
32
+ # Extract date/time stamps (they are the 1st and 2nd capturing group of every match)
33
+ matches = re.findall(pattern, data)
34
+
35
+ dates = []
36
+ for match in matches:
37
+ date_part = match[0] # e.g., '19/11/2023'
38
+ time_part = match[1] # e.g., '07:43' or '8:09 am'
39
+ # Combine date and time, stripping the unicode space that often appears in the time part
40
+ combined_dt = f"{date_part}, {time_part}".replace('\u202f', ' ').replace('\u00a0', ' ').strip()
41
+ dates.append(combined_dt)
42
+
43
+ print(f"Found {len(messages)} messages and {len(dates)} dates")
44
+
45
  if len(messages) != len(dates) or len(messages) == 0:
46
  print(f"Error: Mismatched number of messages ({len(messages)}) and dates ({len(dates)}). Returning None.")
47
+ # Returning None ensures Streamlit handles the parsing failure gracefully.
48
  return None
49
 
50
  df = pd.DataFrame({'user_message': messages, 'message_date': dates})
51
 
52
+ # --- STEP 3: Robust Date Parsing (Trying 12h, 24h, and 2/4 digit year formats) ---
 
 
53
 
54
+ # 1. Standard 12-hour format (e.g., 01/01/2025, 8:09 AM) - Robust Year
55
+ format_12h_4y = '%d/%m/%Y, %I:%M %p'
56
+ # 2. Standard 24-hour format (e.g., 19/11/2023, 07:43) - Robust Year
57
+ format_24h_4y = '%d/%m/%Y, %H:%M'
58
+ # 3. Standard 12-hour format - 2 Digit Year
59
+ format_12h_2y = '%d/%m/%y, %I:%M %p'
60
+ # 4. Standard 24-hour format - 2 Digit Year
61
+ format_24h_2y = '%d/%m/%y, %H:%M'
62
+
63
+ # Convert 'message_date' column to list of strings for processing
64
+ date_series = df['message_date']
65
+
66
+ # Initialize 'date' column with NaT (Not a Time)
67
+ df['date'] = pd.NaT
68
 
69
+ # List of formats to try, in order of likelihood
70
+ formats_to_try = [format_12h_4y, format_24h_4y, format_12h_2y, format_24h_2y]
71
 
72
+ for format_str in formats_to_try:
73
+ unparsed = df['date'].isna()
74
+ if unparsed.any():
75
+ # Try parsing the remaining unparsed dates with the current format string
76
+ df.loc[unparsed, 'date'] = pd.to_datetime(
77
+ df.loc[unparsed, 'message_date'],
78
+ format=format_str,
79
+ errors='coerce'
80
+ )
81
 
82
+ # Drop rows where parsing failed with all formats
83
  df.dropna(subset=['date'], inplace=True)
84
  if df.empty:
85
+ print("Error: DataFrame is empty after parsing dates. All date formats failed.")
86
  return None
87
 
88
+ df.rename(columns={'message_date': 'timestamp_string'}, inplace=True)
89
+ df['user'] = df['user_message'].apply(lambda x: re.split(r'([\w\W]+?):\s', x, 1)[1].strip() if len(re.split(r'([\w\W]+?):\s', x, 1)) > 2 else 'group_notification')
90
+ df['message'] = df['user_message'].apply(lambda x: re.split(r'([\w\W]+?):\s', x, 1)[2].strip() if len(re.split(r'([\w\W]+?):\s', x, 1)) > 2 else x.strip())
91
+
92
+ # Clean up group notifications and drops
93
+ df.drop(columns=['user_message'], inplace=True)
94
+ df = df[df['user'] != 'group_notification'].copy()
95
+
96
+ # --- STEP 4: Add Metadata Columns ---
 
 
 
 
 
 
 
 
97
  df['only_date'] = df['date'].dt.date
98
  df['year'] = df['date'].dt.year
99
  df['month_num'] = df['date'].dt.month
 
103
  df['hour'] = df['date'].dt.hour
104
  df['minute'] = df['date'].dt.minute
105
 
106
+ print(f"Preprocess finished with {df.shape[0]} valid messages.")
107
  return df