|
|
import re |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
|
|
|
# Header that opens every exported chat entry: "<date>, <time> - ".
# The time may carry an AM/PM marker separated by a regular, narrow
# no-break (U+202F) or no-break (U+00A0) space.
_HEADER_RE = re.compile(
    r'(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}(?:[\s\u202f\u00a0]?(?:AM|PM|am|pm))?) - '
)

# "<sender>: <text>" -- the sender is everything before the first colon that
# is followed by whitespace.
_SENDER_RE = re.compile(r'([\w\W]+?):\s')

# Exotic spaces are normalised so that strptime-style formats can match.
_SPACE_FIXES = str.maketrans({'\u202f': ' ', '\u00a0': ' '})

# Day-first timestamp layouts, tried in this order until a row parses.
_DATE_FORMATS = (
    '%d/%m/%Y, %I:%M %p',  # 12-hour clock, 4-digit year
    '%d/%m/%Y, %H:%M',     # 24-hour clock, 4-digit year
    '%d/%m/%y, %I:%M %p',  # 12-hour clock, 2-digit year
    '%d/%m/%y, %H:%M',     # 24-hour clock, 2-digit year
)


def _split_sender(entry):
    """Return ``(user, message)`` for one raw chat entry.

    Entries without a ``"<sender>: "`` prefix are attributed to the
    pseudo-user ``'group_notification'``.
    """
    found = _SENDER_RE.match(entry)
    if found is None:
        return 'group_notification', entry.strip()
    return found.group(1).strip(), entry[found.end():].strip()


def _parse_timestamps(raw):
    """Parse a Series of timestamp strings, trying each known format in turn.

    Rows that match none of the formats are left as ``NaT``.
    """
    parsed = pd.to_datetime(raw, format=_DATE_FORMATS[0], errors='coerce')
    for fmt in _DATE_FORMATS[1:]:
        pending = parsed.isna()
        if not pending.any():
            break
        # Only retry the rows that every earlier format failed on.
        parsed.loc[pending] = pd.to_datetime(
            raw.loc[pending], format=fmt, errors='coerce'
        )
    return parsed


def preprocess(data: str) -> "pd.DataFrame | None":
    """Turn an exported chat transcript into a per-message DataFrame.

    The text is split on ``"<date>, <time> - "`` headers, each timestamp is
    parsed with the day-first formats in ``_DATE_FORMATS``, and every entry
    is separated into sender and message text. Entries whose timestamp
    cannot be parsed, and entries without a ``"<sender>: "`` prefix, are
    dropped.

    Args:
        data: Full text of the exported chat.

    Returns:
        A DataFrame with the columns ``timestamp_string``, ``date``,
        ``user``, ``message``, ``only_date``, ``year``, ``month_num``,
        ``month``, ``day``, ``day_name``, ``hour`` and ``minute``; or
        ``None`` when no header is found or no timestamp can be parsed.
    """
    print("Preprocess started")

    # Discard any preamble that precedes the first timestamped line.
    lines = data.split('\n')
    first = next(
        (idx for idx, line in enumerate(lines) if _HEADER_RE.search(line)),
        0,
    )
    data = '\n'.join(lines[first:])

    # With two capture groups the split result is laid out as
    # [preamble, date, time, text, date, time, text, ...], so headers and
    # message texts come from the same pass and always pair up.
    pieces = _HEADER_RE.split(data)
    messages = pieces[3::3]
    dates = [
        f"{date_part}, {time_part}".translate(_SPACE_FIXES).strip()
        for date_part, time_part in zip(pieces[1::3], pieces[2::3])
    ]

    print(f"Found {len(messages)} messages and {len(dates)} dates")

    if not messages:
        print("Error: No timestamped messages found. Returning None.")
        return None

    df = pd.DataFrame({'user_message': messages, 'timestamp_string': dates})

    df['date'] = _parse_timestamps(df['timestamp_string'])
    df = df.dropna(subset=['date'])
    if df.empty:
        print("Error: DataFrame is empty after parsing dates. All date formats failed.")
        return None

    # Split each entry once and reuse the result for both columns.
    split_entries = [_split_sender(entry) for entry in df['user_message']]
    df['user'] = [sender for sender, _ in split_entries]
    df['message'] = [body for _, body in split_entries]

    df = df.drop(columns=['user_message'])
    df = df[df['user'] != 'group_notification'].copy()

    # Calendar breakdown of the parsed timestamp.
    when = df['date'].dt
    df['only_date'] = when.date
    df['year'] = when.year
    df['month_num'] = when.month
    df['month'] = when.month_name()
    df['day'] = when.day
    df['day_name'] = when.day_name()
    df['hour'] = when.hour
    df['minute'] = when.minute

    print(f"Preprocess finished with {df.shape[0]} valid messages.")
    return df