Update src/preprocessor.py
Browse files- src/preprocessor.py +90 -45
src/preprocessor.py
CHANGED
|
@@ -2,74 +2,107 @@ import re
|
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
def preprocess(data):
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
#
|
| 11 |
-
pattern
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
| 17 |
-
date = re.findall(pattern, data, flags=re.I)
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
return None
|
| 24 |
|
| 25 |
-
# Clean up dates
|
| 26 |
-
|
| 27 |
-
df = pd.DataFrame({'user_message': messages, 'message_date': dates})
|
| 28 |
-
|
| 29 |
-
# --- Date Parsing: Try common formats ---
|
| 30 |
-
known_formats = [
|
| 31 |
-
'%d/%m/%y, %I:%M %p - ', # Format without seconds (e.g., 01/01/25, 8:09 am)
|
| 32 |
-
'%d/%m/%Y, %I:%M %p - ', # Format without seconds (4-digit year)
|
| 33 |
-
'%d/%m/%Y, %I:%M:%S %p - ' # Format with seconds (4-digit year)
|
| 34 |
-
]
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
df['date'] = pd.NaT
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
df['date'] = df['date'].fillna(converted)
|
| 41 |
|
| 42 |
-
if df['date'].isna().any():
|
| 43 |
-
print("Warning: Date parsing failed for some rows. Rows without a valid date will be dropped.")
|
| 44 |
-
|
| 45 |
df.rename(columns={'date': 'date'}, inplace=True)
|
|
|
|
| 46 |
|
| 47 |
-
# --- Extract Users and Messages ---
|
| 48 |
users = []
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
# Regex to capture sender name non-greedily up to the colon separator.
|
| 52 |
-
user_pattern = r'^([\w\W]+?):\s'
|
| 53 |
|
| 54 |
for message in df['user_message']:
|
|
|
|
| 55 |
entry = re.split(user_pattern, message, maxsplit=1)
|
| 56 |
|
| 57 |
-
if len(entry) > 1:
|
| 58 |
users.append(entry[1].strip())
|
| 59 |
-
|
| 60 |
-
else:
|
| 61 |
users.append('group_notification')
|
| 62 |
-
|
| 63 |
|
| 64 |
df['user'] = users
|
| 65 |
-
df['message'] =
|
|
|
|
| 66 |
|
| 67 |
-
# ---
|
| 68 |
-
df.drop(columns=['user_message', 'message_date'], inplace=True)
|
| 69 |
-
|
| 70 |
-
df.dropna(subset=['date'], inplace=True) # Drop rows where date parsing failed
|
| 71 |
-
|
| 72 |
-
# Extract temporal features
|
| 73 |
df['only_date'] = df['date'].dt.date
|
| 74 |
df['year'] = df['date'].dt.year
|
| 75 |
df['month_num'] = df['date'].dt.month
|
|
@@ -79,4 +112,16 @@ def preprocess(data):
|
|
| 79 |
df['hour'] = df['date'].dt.hour
|
| 80 |
df['minute'] = df['date'].dt.minute
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
return df
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
# Header that precedes every exported message, e.g. "01/01/25, 8:09 pm - ".
# The AM/PM marker is captured (group 3) so the 12-hour information is still
# available when the timestamp is parsed.
_HEADER_RE = re.compile(
    r'(\d{1,2}/\d{1,2}/\d{2,4}),\s*'        # group 1: D/M/Y with 2-4 digit year
    r'(\d{1,2}:\d{2}(?::\d{2})?)'            # group 2: H:MM with optional :SS
    r'[\s\u202f\u00a0]*([ap]m)?\s*-\s*',     # group 3: optional AM/PM marker
    flags=re.IGNORECASE,
)

# Sender name: everything (non-greedy) up to the first colon followed by
# whitespace at the start of a message block.
_USER_RE = re.compile(r'^([\w\W]+?):\s')

# Formats for the canonical strings produced by _canonical_stamp(). They must
# mirror that function's output exactly: "<date>, <time>" plus " AM"/" PM"
# when a marker was present, and no trailing " - " separator.
_DATE_FORMATS = (
    # 12-hour clock (AM/PM marker present)
    '%d/%m/%y, %I:%M %p',
    '%d/%m/%Y, %I:%M %p',
    '%d/%m/%y, %I:%M:%S %p',
    '%d/%m/%Y, %I:%M:%S %p',
    # 24-hour clock (no marker)
    '%d/%m/%y, %H:%M',
    '%d/%m/%Y, %H:%M',
    '%d/%m/%y, %H:%M:%S',
    '%d/%m/%Y, %H:%M:%S',
)


def _canonical_stamp(date_part, time_part, meridiem):
    """Join the captured header pieces into one string matching _DATE_FORMATS."""
    stamp = f"{date_part}, {time_part}"
    if meridiem:
        stamp = f"{stamp} {meridiem.upper()}"
    return stamp


def _parse_stamps(stamps):
    """Parse canonical timestamp strings, trying each known format in turn."""
    parsed = None
    for fmt in _DATE_FORMATS:
        attempt = pd.to_datetime(stamps, format=fmt, errors='coerce')
        # Keep earlier successes; only fill rows that are still unparsed.
        parsed = attempt if parsed is None else parsed.fillna(attempt)
    return parsed


def _split_sender(block):
    """Return (user, message) for one block; system lines get 'group_notification'."""
    found = _USER_RE.match(block)
    if found:
        return found.group(1).strip(), block[found.end():].strip()
    return 'group_notification', block.strip()


def _hour_period(hour):
    """Return the one-hour bucket label for an hour of day (e.g. 8 -> '8-9')."""
    hour = int(hour)
    if hour == 23:
        return "23-00"
    if hour == 0:
        return "00-1"
    return f"{hour}-{hour + 1}"


def preprocess(data):
    """Turn raw exported chat text into a DataFrame with one row per message.

    The text is cut at every timestamp header of the form
    ``D/M/Y, H:MM[:SS][ AM|PM] - `` (day-first dates, 2- or 4-digit year,
    12- or 24-hour clock, ordinary or non-breaking space before AM/PM).
    Text before the first header is ignored. Rows whose timestamp cannot be
    parsed with any of the known formats are dropped.

    Args:
        data: The full chat export as a single string.

    Returns:
        A DataFrame with the columns ``message_date`` (normalised timestamp
        text), ``date`` (parsed datetime), ``user``, ``message``,
        ``only_date``, ``year``, ``month_num``, ``month``, ``day``,
        ``day_name``, ``hour``, ``minute`` and ``period``. The frame is empty
        when no header is found.
    """
    headers = list(_HEADER_RE.finditer(data))

    # A message body runs from the end of its header to the start of the next
    # header (or to the end of the text for the last message). Slicing by
    # match offsets keeps bodies and timestamps aligned by construction.
    bodies = [
        data[current.end():following.start()]
        for current, following in zip(headers, headers[1:])
    ]
    if headers:
        bodies.append(data[headers[-1].end():])

    stamps = [_canonical_stamp(*header.groups()) for header in headers]

    df = pd.DataFrame({'user_message': bodies, 'message_date': stamps})
    df['date'] = _parse_stamps(df['message_date'])
    df.dropna(subset=['date'], inplace=True)  # Drop rows where date parsing failed

    # Separate the sender from the message text.
    senders = [_split_sender(block) for block in df['user_message']]
    df['user'] = [user for user, _ in senders]
    df['message'] = [text for _, text in senders]
    df.drop(columns=['user_message'], inplace=True)

    # Temporal features derived from the parsed timestamp.
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    # NOTE(review): the next three assignments fall in a part of the function
    # that was not visible during review; they are reconstructed on the
    # assumption that it defines month/day/day_name -- confirm before merging.
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    # Hour-of-day bucket label, e.g. "8-9", "23-00", "00-1".
    df['period'] = [_hour_period(hour) for hour in df['hour']]

    return df
|